xref: /onnv-gate/usr/src/uts/common/vm/vm_seg.c (revision 11066:cebb50cbe4f9)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
53247Sgjelinek  * Common Development and Distribution License (the "License").
63247Sgjelinek  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*11066Srafael.vanoni@sun.com  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
270Sstevel@tonic-gate /*	  All Rights Reserved  	*/
280Sstevel@tonic-gate 
290Sstevel@tonic-gate /*
300Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
310Sstevel@tonic-gate  * The Regents of the University of California
320Sstevel@tonic-gate  * All Rights Reserved
330Sstevel@tonic-gate  *
340Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
350Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
360Sstevel@tonic-gate  * contributors.
370Sstevel@tonic-gate  */
380Sstevel@tonic-gate 
390Sstevel@tonic-gate /*
400Sstevel@tonic-gate  * VM - segment management.
410Sstevel@tonic-gate  */
420Sstevel@tonic-gate 
430Sstevel@tonic-gate #include <sys/types.h>
440Sstevel@tonic-gate #include <sys/inttypes.h>
450Sstevel@tonic-gate #include <sys/t_lock.h>
460Sstevel@tonic-gate #include <sys/param.h>
470Sstevel@tonic-gate #include <sys/systm.h>
480Sstevel@tonic-gate #include <sys/kmem.h>
496695Saguzovsk #include <sys/sysmacros.h>
500Sstevel@tonic-gate #include <sys/vmsystm.h>
516695Saguzovsk #include <sys/tuneable.h>
520Sstevel@tonic-gate #include <sys/debug.h>
536695Saguzovsk #include <sys/fs/swapnode.h>
540Sstevel@tonic-gate #include <sys/cmn_err.h>
550Sstevel@tonic-gate #include <sys/callb.h>
560Sstevel@tonic-gate #include <sys/mem_config.h>
573247Sgjelinek #include <sys/mman.h>
580Sstevel@tonic-gate 
590Sstevel@tonic-gate #include <vm/hat.h>
600Sstevel@tonic-gate #include <vm/as.h>
610Sstevel@tonic-gate #include <vm/seg.h>
620Sstevel@tonic-gate #include <vm/seg_kmem.h>
633247Sgjelinek #include <vm/seg_spt.h>
643247Sgjelinek #include <vm/seg_vn.h>
656695Saguzovsk #include <vm/anon.h>
666695Saguzovsk 
670Sstevel@tonic-gate /*
680Sstevel@tonic-gate  * kstats for segment advise
690Sstevel@tonic-gate  */
700Sstevel@tonic-gate segadvstat_t segadvstat = {
710Sstevel@tonic-gate 	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
720Sstevel@tonic-gate 	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
730Sstevel@tonic-gate };
740Sstevel@tonic-gate 
750Sstevel@tonic-gate kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
760Sstevel@tonic-gate uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
770Sstevel@tonic-gate 
780Sstevel@tonic-gate /*
790Sstevel@tonic-gate  * entry in the segment page cache
800Sstevel@tonic-gate  */
810Sstevel@tonic-gate struct seg_pcache {
826695Saguzovsk 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
836695Saguzovsk 	struct seg_pcache	*p_hprev;
846695Saguzovsk 	pcache_link_t		p_plink;	/* per segment/amp list */
856695Saguzovsk 	void 			*p_htag0;	/* segment/amp pointer */
866695Saguzovsk 	caddr_t			p_addr;		/* base address/anon_idx */
876695Saguzovsk 	size_t			p_len;		/* total bytes */
886695Saguzovsk 	size_t			p_wlen;		/* writable bytes at p_addr */
896695Saguzovsk 	struct page		**p_pp;		/* pp shadow list */
906695Saguzovsk 	seg_preclaim_cbfunc_t	p_callback;	/* reclaim callback function */
916695Saguzovsk 	clock_t			p_lbolt;	/* lbolt from last use */
926695Saguzovsk 	struct seg_phash	*p_hashp;	/* our pcache hash bucket */
936695Saguzovsk 	uint_t			p_active;	/* active count */
946695Saguzovsk 	uchar_t			p_write;	/* true if S_WRITE */
956695Saguzovsk 	uchar_t			p_ref;		/* reference byte */
966695Saguzovsk 	ushort_t		p_flags;	/* bit flags */
970Sstevel@tonic-gate };
980Sstevel@tonic-gate 
990Sstevel@tonic-gate struct seg_phash {
1006695Saguzovsk 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
1016695Saguzovsk 	struct seg_pcache	*p_hprev;
1026695Saguzovsk 	kmutex_t		p_hmutex;	/* protects hash bucket */
1036695Saguzovsk 	pcache_link_t		p_halink[2];	/* active bucket linkages */
1046695Saguzovsk };
1056695Saguzovsk 
1066695Saguzovsk struct seg_phash_wired {
1076695Saguzovsk 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
1086695Saguzovsk 	struct seg_pcache	*p_hprev;
1096695Saguzovsk 	kmutex_t		p_hmutex;	/* protects hash bucket */
1100Sstevel@tonic-gate };
1110Sstevel@tonic-gate 
1126695Saguzovsk /*
1136695Saguzovsk  * A parameter to control a maximum number of bytes that can be
1146695Saguzovsk  * purged from pcache at a time.
1156695Saguzovsk  */
1166695Saguzovsk #define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)
1176695Saguzovsk 
1186695Saguzovsk /*
1196695Saguzovsk  * log2(fraction of pcache to reclaim at a time).
1206695Saguzovsk  */
1216695Saguzovsk #define	P_SHRINK_SHFT		(5)
1226695Saguzovsk 
1236695Saguzovsk /*
1246695Saguzovsk  * The following variables can be tuned via /etc/system.
1256695Saguzovsk  */
1266695Saguzovsk 
1276695Saguzovsk int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
1286695Saguzovsk pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
1296695Saguzovsk ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
1306695Saguzovsk ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
1316695Saguzovsk int	segpcache_reap_sec = 1;		/* reap check rate in secs */
1326695Saguzovsk clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
1336695Saguzovsk int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
1346695Saguzovsk clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
1356695Saguzovsk int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
1366695Saguzovsk pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */
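/*
 * Illustrative sketch only (values are hypothetical, not recommendations):
 * the tunables above are normally overridden by adding "set" directives to
 * /etc/system and rebooting, e.g.
 *
 *	set segpcache_enabled = 0
 *	set segpcache_reap_sec = 5
 *	set segpcache_maxwindow = 0x20000
 *
 * The variables initialized to 0 above (the hash sizes, the window size and
 * the tick values) are given defaults at boot when left at 0.
 */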
1370Sstevel@tonic-gate 
1386695Saguzovsk static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
1396695Saguzovsk static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
1406695Saguzovsk static kcondvar_t seg_pasync_cv;
1416695Saguzovsk 
1426695Saguzovsk #pragma align 64(pctrl1)
1436695Saguzovsk #pragma align 64(pctrl2)
1446695Saguzovsk #pragma align 64(pctrl3)
1450Sstevel@tonic-gate 
1466695Saguzovsk /*
1476695Saguzovsk  * Keep frequently used variables together in one cache line.
1486695Saguzovsk  */
1496695Saguzovsk static struct p_ctrl1 {
1506695Saguzovsk 	uint_t p_disabled;		/* if not 0, caching temporarily off */
1516695Saguzovsk 	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
1526695Saguzovsk 	size_t p_hashwin_sz;		/* # of non wired buckets */
1536695Saguzovsk 	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
1546695Saguzovsk 	size_t p_hashwired_sz;		/* # of wired buckets */
1556695Saguzovsk 	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
1566695Saguzovsk 	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
1576695Saguzovsk #ifdef _LP64
1586695Saguzovsk 	ulong_t pad[1];
1596695Saguzovsk #endif /* _LP64 */
1606695Saguzovsk } pctrl1;
1616695Saguzovsk 
1626695Saguzovsk static struct p_ctrl2 {
1636695Saguzovsk 	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
1646695Saguzovsk 	pgcnt_t  p_locked_win;	/* # pages from window */
1656695Saguzovsk 	pgcnt_t  p_locked;	/* # of pages cached by pagelock */
1666695Saguzovsk 	uchar_t	 p_ahcur;	/* current active links for insert/delete */
1676695Saguzovsk 	uchar_t  p_athr_on;	/* async reclaim thread is running. */
1686695Saguzovsk 	pcache_link_t p_ahhead[2]; /* active buckets linkages */
1696695Saguzovsk } pctrl2;
1700Sstevel@tonic-gate 
1716695Saguzovsk static struct p_ctrl3 {
1726695Saguzovsk 	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
1736695Saguzovsk 	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
1746695Saguzovsk 	ulong_t p_athr_full_ahb;	/* athread walk stats */
1756695Saguzovsk 	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
1766695Saguzovsk 	int	p_shrink_shft;		/* reap shift factor */
1776695Saguzovsk #ifdef _LP64
1786695Saguzovsk 	ulong_t pad[3];
1796695Saguzovsk #endif /* _LP64 */
1806695Saguzovsk } pctrl3;
1810Sstevel@tonic-gate 
1826695Saguzovsk #define	seg_pdisabled			pctrl1.p_disabled
1836695Saguzovsk #define	seg_pmaxwindow			pctrl1.p_maxwin
1846695Saguzovsk #define	seg_phashsize_win		pctrl1.p_hashwin_sz
1856695Saguzovsk #define	seg_phashtab_win		pctrl1.p_htabwin
1866695Saguzovsk #define	seg_phashsize_wired		pctrl1.p_hashwired_sz
1876695Saguzovsk #define	seg_phashtab_wired		pctrl1.p_htabwired
1886695Saguzovsk #define	seg_pkmcache			pctrl1.p_kmcache
1896695Saguzovsk #define	seg_pmem_mtx			pctrl2.p_mem_mtx
1906695Saguzovsk #define	seg_plocked_window		pctrl2.p_locked_win
1916695Saguzovsk #define	seg_plocked			pctrl2.p_locked
1926695Saguzovsk #define	seg_pahcur			pctrl2.p_ahcur
1936695Saguzovsk #define	seg_pathr_on			pctrl2.p_athr_on
1946695Saguzovsk #define	seg_pahhead			pctrl2.p_ahhead
1956695Saguzovsk #define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
1966695Saguzovsk #define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
1976695Saguzovsk #define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
1986695Saguzovsk #define	seg_pshrink_shift		pctrl3.p_shrink_shft
1996695Saguzovsk #define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages
2006695Saguzovsk 
2016695Saguzovsk #define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
2026695Saguzovsk #define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
2036695Saguzovsk #define	P_BASESHIFT			(6)
2046695Saguzovsk 
2056695Saguzovsk kthread_t *seg_pasync_thr;
2066695Saguzovsk 
2076695Saguzovsk extern struct seg_ops segvn_ops;
2086695Saguzovsk extern struct seg_ops segspt_shmops;
2090Sstevel@tonic-gate 
2106695Saguzovsk #define	IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
2116695Saguzovsk #define	IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
2126695Saguzovsk 
213*11066Srafael.vanoni@sun.com #define	LBOLT_DELTA(t)	((ulong_t)(ddi_get_lbolt() - (t)))
2146695Saguzovsk 
2156695Saguzovsk #define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)
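/*
 * Descriptive note: PCP_AGE() is the number of clock ticks since the entry
 * was last used (p_lbolt is refreshed in seg_pinactive()).  The async purge
 * thread skips inactive entries whose ref bit is set and whose age is still
 * below seg_pmax_pcpage (cf. segpcache_pcp_maxage_ticks above), so recently
 * reused shadow lists survive a purge pass.
 */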
2166695Saguzovsk 
2176695Saguzovsk /*
2186695Saguzovsk  * htag0 argument can be a seg or amp pointer.
2196695Saguzovsk  */
2206695Saguzovsk #define	P_HASHBP(seg, htag0, addr, flags)				\
2216695Saguzovsk 	(IS_PFLAGS_WIRED((flags)) ?					\
2226695Saguzovsk 	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &	\
2236695Saguzovsk 	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :			\
2246695Saguzovsk 	    (&seg_phashtab_win[P_HASHWIN_MASK &				\
2256695Saguzovsk 	    (((uintptr_t)(htag0) >> 3) ^				\
2266695Saguzovsk 	    ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?		\
2276695Saguzovsk 	    (flags >> 16) : page_get_shift((seg)->s_szc))))]))
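/*
 * Simplified reading of P_HASHBP() above: wired entries hash on the seg/amp
 * pointer alone,
 *
 *	bucket = (htag0 >> P_BASESHIFT) & P_HASHWIRED_MASK
 *
 * so all wired entries for a given segment share one bucket, while non wired
 * entries mix the address in,
 *
 *	bucket = ((htag0 >> 3) ^ (addr >> pgshift)) & P_HASHWIN_MASK
 *
 * where pgshift is taken from the SEGP_PSHIFT bits of "flags" if set, or
 * from the segment's page size otherwise, so the many entries of a large
 * segment or amp spread across buckets.
 */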
2280Sstevel@tonic-gate 
2296695Saguzovsk /*
2306695Saguzovsk  * htag0 argument can be a seg or amp pointer.
2316695Saguzovsk  */
2326695Saguzovsk #define	P_MATCH(pcp, htag0, addr, len)					\
2336695Saguzovsk 	((pcp)->p_htag0 == (htag0) &&					\
2346695Saguzovsk 	(pcp)->p_addr == (addr) &&					\
2356695Saguzovsk 	(pcp)->p_len >= (len))
2360Sstevel@tonic-gate 
2376695Saguzovsk #define	P_MATCH_PP(pcp, htag0, addr, len, pp)				\
2386695Saguzovsk 	((pcp)->p_pp == (pp) &&						\
2396695Saguzovsk 	(pcp)->p_htag0 == (htag0) &&					\
2406695Saguzovsk 	(pcp)->p_addr == (addr) &&					\
2416695Saguzovsk 	(pcp)->p_len >= (len))
2426695Saguzovsk 
2436695Saguzovsk #define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
2446695Saguzovsk     offsetof(struct seg_pcache, p_plink)))
2456695Saguzovsk 
2466695Saguzovsk #define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) -	\
2476695Saguzovsk     offsetof(struct seg_phash, p_halink[l])))
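/*
 * plink2pcache() and hlink2phash() are container_of() style helpers: given a
 * pointer to an embedded pcache_link_t they recover the enclosing seg_pcache
 * or seg_phash by subtracting the link member's offset.
 */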
2480Sstevel@tonic-gate 
2490Sstevel@tonic-gate /*
2506695Saguzovsk  * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
2516695Saguzovsk  * active hash bucket lists. We maintain active bucket lists to reduce the
2526695Saguzovsk  * overhead of finding active buckets during asynchronous purging since there
2536695Saguzovsk  * can be tens of millions of buckets on a large system but only a small subset
2546695Saguzovsk  * of them in actual use.
2556695Saguzovsk  *
2566695Saguzovsk  * There are two active bucket lists. The current active list (as per
2576695Saguzovsk  * seg_pahcur) is used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add
2586695Saguzovsk  * and delete buckets. The other list is used by the asynchronous purge
2596695Saguzovsk  * thread. This allows the purge thread to walk its list without holding
2606695Saguzovsk  * seg_pmem_mtx for a long time. When the asynchronous thread is done with
2616695Saguzovsk  * its list it switches to the current active list and makes the list it
2626695Saguzovsk  * just finished processing the new current active list.
2636695Saguzovsk  *
2646695Saguzovsk  * seg_padd_abuck() only adds the bucket to current list if the bucket is not
2656695Saguzovsk  * yet on any list.  seg_premove_abuck() may remove the bucket from either
2666695Saguzovsk  * list. If the bucket is on current list it will be always removed. Otherwise
2676695Saguzovsk  * the bucket is only removed if asynchronous purge thread is not currently
2686695Saguzovsk  * running or seg_premove_abuck() is called by asynchronous purge thread
2696695Saguzovsk  * itself. A given bucket can only be on one of active lists at a time. These
2706695Saguzovsk  * routines should be called with per bucket lock held.  The routines use
2716695Saguzovsk  * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
2726695Saguzovsk  * the first entry is added to the bucket chain and seg_premove_abuck() must
2736695Saguzovsk  * be called after the last pcp entry is deleted from its chain. Per bucket
2746695Saguzovsk  * lock should be held by the callers.  This avoids a potential race condition
2756695Saguzovsk  * when seg_premove_abuck() removes a bucket after pcp entries are added to
2766695Saguzovsk  * its list after the caller checked that the bucket has no entries. (this
2776695Saguzovsk  * race would cause a loss of an active bucket from the active lists).
2786695Saguzovsk  *
2796695Saguzovsk  * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
2806695Saguzovsk  * New entries are added to the end of the list since LRU is used as the
2816695Saguzovsk  * purging policy.
2826695Saguzovsk  */
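/*
 * A rough sketch of the list hand-off described above (the real walk is in
 * seg_ppurge_async(); the variable names below mirror that function):
 *
 *	lix = seg_pahcur;	insert/delete side works on list [lix]
 *	hlix = !seg_pahcur;	async thread walks the other list [hlix]
 *	... async thread purges buckets on seg_pahhead[hlix] ...
 *	seg_pahcur = hlix;	the two lists then trade roles
 */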
2836695Saguzovsk static void
2846695Saguzovsk seg_padd_abuck(struct seg_phash *hp)
2856695Saguzovsk {
2866695Saguzovsk 	int lix;
2876695Saguzovsk 
2886695Saguzovsk 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
2896695Saguzovsk 	ASSERT((struct seg_phash *)hp->p_hnext != hp);
2906695Saguzovsk 	ASSERT((struct seg_phash *)hp->p_hprev != hp);
2916695Saguzovsk 	ASSERT(hp->p_hnext == hp->p_hprev);
2926695Saguzovsk 	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
2936695Saguzovsk 	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
2946695Saguzovsk 	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
2956695Saguzovsk 	ASSERT(hp >= seg_phashtab_win &&
2966695Saguzovsk 	    hp < &seg_phashtab_win[seg_phashsize_win]);
2976695Saguzovsk 
2986695Saguzovsk 	/*
2996695Saguzovsk 	 * This bucket can already be on one of active lists
3006695Saguzovsk 	 * since seg_premove_abuck() may have failed to remove it
3016695Saguzovsk 	 * before.
3026695Saguzovsk 	 */
3036695Saguzovsk 	mutex_enter(&seg_pmem_mtx);
3046695Saguzovsk 	lix = seg_pahcur;
3056695Saguzovsk 	ASSERT(lix >= 0 && lix <= 1);
3066695Saguzovsk 	if (hp->p_halink[lix].p_lnext != NULL) {
3076695Saguzovsk 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
3086695Saguzovsk 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
3096695Saguzovsk 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
3106695Saguzovsk 		mutex_exit(&seg_pmem_mtx);
3116695Saguzovsk 		return;
3126695Saguzovsk 	}
3136695Saguzovsk 	ASSERT(hp->p_halink[lix].p_lprev == NULL);
3146695Saguzovsk 
3156695Saguzovsk 	/*
3166695Saguzovsk 	 * If this bucket is still on list !lix the async thread can't yet
3176695Saguzovsk 	 * remove it since we hold the per bucket lock here. Just return;
3186695Saguzovsk 	 * the async thread will eventually find and process this bucket.
3196695Saguzovsk 	 */
3206695Saguzovsk 	if (hp->p_halink[!lix].p_lnext != NULL) {
3216695Saguzovsk 		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
3226695Saguzovsk 		mutex_exit(&seg_pmem_mtx);
3236695Saguzovsk 		return;
3246695Saguzovsk 	}
3256695Saguzovsk 	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
3266695Saguzovsk 	/*
3276695Saguzovsk 	 * This bucket is not on any active bucket list yet.
3286695Saguzovsk 	 * Add the bucket to the tail of current active list.
3296695Saguzovsk 	 */
3306695Saguzovsk 	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
3316695Saguzovsk 	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
3326695Saguzovsk 	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
3336695Saguzovsk 	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
3346695Saguzovsk 	mutex_exit(&seg_pmem_mtx);
3356695Saguzovsk }
3366695Saguzovsk 
3376695Saguzovsk static void
3386695Saguzovsk seg_premove_abuck(struct seg_phash *hp, int athr)
3396695Saguzovsk {
3406695Saguzovsk 	int lix;
3416695Saguzovsk 
3426695Saguzovsk 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
3436695Saguzovsk 	ASSERT((struct seg_phash *)hp->p_hnext == hp);
3446695Saguzovsk 	ASSERT((struct seg_phash *)hp->p_hprev == hp);
3456695Saguzovsk 	ASSERT(hp >= seg_phashtab_win &&
3466695Saguzovsk 	    hp < &seg_phashtab_win[seg_phashsize_win]);
3476695Saguzovsk 
3486695Saguzovsk 	if (athr) {
3496695Saguzovsk 		ASSERT(seg_pathr_on);
3506695Saguzovsk 		ASSERT(seg_pahcur <= 1);
3516695Saguzovsk 		/*
3526695Saguzovsk 		 * We are called by asynchronous thread that found this bucket
3536695Saguzovsk 		 * on not currently active (i.e. !seg_pahcur) list. Remove it
3546695Saguzovsk 		 * from there.  Per bucket lock we are holding makes sure
3556695Saguzovsk 		 * seg_pinsert() can't sneak in and add pcp entries to this
3566695Saguzovsk 		 * bucket right before we remove the bucket from its list.
3576695Saguzovsk 		 */
3586695Saguzovsk 		lix = !seg_pahcur;
3596695Saguzovsk 		ASSERT(hp->p_halink[lix].p_lnext != NULL);
3606695Saguzovsk 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
3616695Saguzovsk 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
3626695Saguzovsk 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
3636695Saguzovsk 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
3646695Saguzovsk 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
3656695Saguzovsk 		hp->p_halink[lix].p_lnext = NULL;
3666695Saguzovsk 		hp->p_halink[lix].p_lprev = NULL;
3676695Saguzovsk 		return;
3686695Saguzovsk 	}
3696695Saguzovsk 
3706695Saguzovsk 	mutex_enter(&seg_pmem_mtx);
3716695Saguzovsk 	lix = seg_pahcur;
3726695Saguzovsk 	ASSERT(lix >= 0 && lix <= 1);
3736695Saguzovsk 
3746695Saguzovsk 	/*
3756695Saguzovsk 	 * If the bucket is on currently active list just remove it from
3766695Saguzovsk 	 * there.
3776695Saguzovsk 	 */
3786695Saguzovsk 	if (hp->p_halink[lix].p_lnext != NULL) {
3796695Saguzovsk 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
3806695Saguzovsk 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
3816695Saguzovsk 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
3826695Saguzovsk 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
3836695Saguzovsk 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
3846695Saguzovsk 		hp->p_halink[lix].p_lnext = NULL;
3856695Saguzovsk 		hp->p_halink[lix].p_lprev = NULL;
3866695Saguzovsk 		mutex_exit(&seg_pmem_mtx);
3876695Saguzovsk 		return;
3886695Saguzovsk 	}
3896695Saguzovsk 	ASSERT(hp->p_halink[lix].p_lprev == NULL);
3906695Saguzovsk 
3916695Saguzovsk 	/*
3926695Saguzovsk 	 * If asynchronous thread is not running we can remove the bucket from
3936695Saguzovsk 	 * not currently active list. The bucket must be on this list since we
3946695Saguzovsk 	 * already checked that it's not on the other list and the bucket from
3956695Saguzovsk 	 * which we just deleted the last pcp entry must be still on one of the
3966695Saguzovsk 	 * active bucket lists.
3976695Saguzovsk 	 */
3986695Saguzovsk 	lix = !lix;
3996695Saguzovsk 	ASSERT(hp->p_halink[lix].p_lnext != NULL);
4006695Saguzovsk 	ASSERT(hp->p_halink[lix].p_lprev != NULL);
4016695Saguzovsk 
4026695Saguzovsk 	if (!seg_pathr_on) {
4036695Saguzovsk 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
4046695Saguzovsk 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
4056695Saguzovsk 		hp->p_halink[lix].p_lnext = NULL;
4066695Saguzovsk 		hp->p_halink[lix].p_lprev = NULL;
4076695Saguzovsk 	}
4086695Saguzovsk 	mutex_exit(&seg_pmem_mtx);
4096695Saguzovsk }
4106695Saguzovsk 
4116695Saguzovsk /*
4126695Saguzovsk  * Check if bucket pointed by hp already has a pcp entry that matches request
4136695Saguzovsk  * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
4146695Saguzovsk  * Also delete matching entries that cover smaller address range but start
4156695Saguzovsk  * at the same address as addr argument. Return the list of deleted entries if
4166695Saguzovsk  * any. This is an internal helper function called from seg_pinsert() only
4176695Saguzovsk  * for non wired shadow lists. The caller already holds a per seg/amp list
4186695Saguzovsk  * lock.
4196695Saguzovsk  */
4206695Saguzovsk static struct seg_pcache *
4216695Saguzovsk seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
4226695Saguzovsk     caddr_t addr, size_t len, int *found)
4236695Saguzovsk {
4246695Saguzovsk 	struct seg_pcache *pcp;
4256695Saguzovsk 	struct seg_pcache *delcallb_list = NULL;
4266695Saguzovsk 
4276695Saguzovsk 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
4286695Saguzovsk 
4296695Saguzovsk 	*found = 0;
4306695Saguzovsk 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
4316695Saguzovsk 	    pcp = pcp->p_hnext) {
4326695Saguzovsk 		ASSERT(pcp->p_hashp == hp);
4336695Saguzovsk 		if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
4346695Saguzovsk 			ASSERT(!IS_PCP_WIRED(pcp));
4356695Saguzovsk 			if (pcp->p_len < len) {
4366695Saguzovsk 				pcache_link_t *plinkp;
4376695Saguzovsk 				if (pcp->p_active) {
4386695Saguzovsk 					continue;
4396695Saguzovsk 				}
4406695Saguzovsk 				plinkp = &pcp->p_plink;
4416695Saguzovsk 				plinkp->p_lprev->p_lnext = plinkp->p_lnext;
4426695Saguzovsk 				plinkp->p_lnext->p_lprev = plinkp->p_lprev;
4436695Saguzovsk 				pcp->p_hprev->p_hnext = pcp->p_hnext;
4446695Saguzovsk 				pcp->p_hnext->p_hprev = pcp->p_hprev;
4456695Saguzovsk 				pcp->p_hprev = delcallb_list;
4466695Saguzovsk 				delcallb_list = pcp;
4476695Saguzovsk 			} else {
4486695Saguzovsk 				*found = 1;
4496695Saguzovsk 				break;
4506695Saguzovsk 			}
4516695Saguzovsk 		}
4526695Saguzovsk 	}
4536695Saguzovsk 	return (delcallb_list);
4546695Saguzovsk }
4556695Saguzovsk 
4566695Saguzovsk /*
4576695Saguzovsk  * Look up an address range in the pagelock cache. Return the shadow list and
4586695Saguzovsk  * bump up the active count. If amp is not NULL use amp as the lookup tag,
4596695Saguzovsk  * otherwise use seg as the lookup tag.
4600Sstevel@tonic-gate  */
4610Sstevel@tonic-gate struct page **
4626695Saguzovsk seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
4636695Saguzovsk     enum seg_rw rw, uint_t flags)
4640Sstevel@tonic-gate {
4650Sstevel@tonic-gate 	struct seg_pcache *pcp;
4660Sstevel@tonic-gate 	struct seg_phash *hp;
4676695Saguzovsk 	void *htag0;
4686695Saguzovsk 
4696695Saguzovsk 	ASSERT(seg != NULL);
4706695Saguzovsk 	ASSERT(rw == S_READ || rw == S_WRITE);
4710Sstevel@tonic-gate 
4720Sstevel@tonic-gate 	/*
4730Sstevel@tonic-gate 	 * Skip the pagelock cache while DR is in progress or
4740Sstevel@tonic-gate 	 * seg_pcache is off.
4750Sstevel@tonic-gate 	 */
4766695Saguzovsk 	if (seg_pdisabled) {
4770Sstevel@tonic-gate 		return (NULL);
4780Sstevel@tonic-gate 	}
4796695Saguzovsk 	ASSERT(seg_phashsize_win != 0);
4800Sstevel@tonic-gate 
4816695Saguzovsk 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
4826695Saguzovsk 	hp = P_HASHBP(seg, htag0, addr, flags);
4830Sstevel@tonic-gate 	mutex_enter(&hp->p_hmutex);
4840Sstevel@tonic-gate 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
4850Sstevel@tonic-gate 	    pcp = pcp->p_hnext) {
4866695Saguzovsk 		ASSERT(pcp->p_hashp == hp);
4876695Saguzovsk 		if (P_MATCH(pcp, htag0, addr, len)) {
4886695Saguzovsk 			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
4896695Saguzovsk 			/*
4906695Saguzovsk 			 * If this request wants to write pages
4916695Saguzovsk 			 * but write permissions starting from
4926695Saguzovsk 			 * addr don't cover the entire length len
4936695Saguzovsk 			 * return lookup failure back to the caller.
4946695Saguzovsk 			 * It will check protections and fail this
4956695Saguzovsk 			 * pagelock operation with an EACCES error.
4966695Saguzovsk 			 */
4976695Saguzovsk 			if (rw == S_WRITE && pcp->p_wlen < len) {
4986695Saguzovsk 				break;
4996695Saguzovsk 			}
5006695Saguzovsk 			if (pcp->p_active == UINT_MAX) {
5016695Saguzovsk 				break;
5026695Saguzovsk 			}
5030Sstevel@tonic-gate 			pcp->p_active++;
5046695Saguzovsk 			if (rw == S_WRITE && !pcp->p_write) {
5056695Saguzovsk 				pcp->p_write = 1;
5066695Saguzovsk 			}
5070Sstevel@tonic-gate 			mutex_exit(&hp->p_hmutex);
5080Sstevel@tonic-gate 			return (pcp->p_pp);
5090Sstevel@tonic-gate 		}
5100Sstevel@tonic-gate 	}
5110Sstevel@tonic-gate 	mutex_exit(&hp->p_hmutex);
5120Sstevel@tonic-gate 	return (NULL);
5130Sstevel@tonic-gate }
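/*
 * Usage sketch (hypothetical driver code, not actual segvn/segspt source;
 * "my_reclaim_cb" is an assumed name for the driver's reclaim callback):
 *
 *	pplist = seg_plookup(seg, amp, addr, len, rw, 0);
 *	if (pplist != NULL)
 *		return the cached shadow list; p_active was bumped
 *	... lock the pages and build pplist the slow way ...
 *	(void) seg_pinsert(seg, amp, addr, len, len, pplist, rw, 0,
 *	    my_reclaim_cb);
 *
 * and on the matching unlock:
 *
 *	seg_pinactive(seg, amp, addr, len, pplist, rw, 0, my_reclaim_cb);
 *
 * which either just drops p_active and sets the ref bit, or, if caching is
 * disabled or the entry was superseded, invokes my_reclaim_cb() to unlock
 * the pages.
 */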
5140Sstevel@tonic-gate 
5150Sstevel@tonic-gate /*
5166695Saguzovsk  * Mark an address range inactive. If the cache is off, or the address range
5176695Saguzovsk  * is not in the cache, or another shadow list that covers a bigger range is
5186695Saguzovsk  * found, we call the segment driver to reclaim the pages. Otherwise just
5196695Saguzovsk  * decrement the active count and set the ref bit.  If amp is not NULL use
5206695Saguzovsk  * amp as the lookup tag, otherwise use seg as the lookup tag.
5210Sstevel@tonic-gate  */
5220Sstevel@tonic-gate void
5236695Saguzovsk seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
5246695Saguzovsk     size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
5256695Saguzovsk     seg_preclaim_cbfunc_t callback)
5260Sstevel@tonic-gate {
5270Sstevel@tonic-gate 	struct seg_pcache *pcp;
5280Sstevel@tonic-gate 	struct seg_phash *hp;
5296695Saguzovsk 	kmutex_t *pmtx = NULL;
5306695Saguzovsk 	pcache_link_t *pheadp;
5316695Saguzovsk 	void *htag0;
5326695Saguzovsk 	pgcnt_t npages = 0;
5336695Saguzovsk 	int keep = 0;
5340Sstevel@tonic-gate 
5356695Saguzovsk 	ASSERT(seg != NULL);
5366695Saguzovsk 	ASSERT(rw == S_READ || rw == S_WRITE);
5376695Saguzovsk 
5386695Saguzovsk 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
5396695Saguzovsk 
5406695Saguzovsk 	/*
5416695Saguzovsk 	 * Skip lookup if pcache is not configured.
5426695Saguzovsk 	 */
5436695Saguzovsk 	if (seg_phashsize_win == 0) {
5446695Saguzovsk 		goto out;
5450Sstevel@tonic-gate 	}
5466695Saguzovsk 
5476695Saguzovsk 	/*
5486695Saguzovsk 	 * Grab per seg/amp lock before hash lock if we are going to remove
5496695Saguzovsk 	 * inactive entry from pcache.
5506695Saguzovsk 	 */
5516695Saguzovsk 	if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
5526695Saguzovsk 		if (amp == NULL) {
5536695Saguzovsk 			pheadp = &seg->s_phead;
5546695Saguzovsk 			pmtx = &seg->s_pmtx;
5556695Saguzovsk 		} else {
5566695Saguzovsk 			pheadp = &amp->a_phead;
5576695Saguzovsk 			pmtx = &amp->a_pmtx;
5586695Saguzovsk 		}
5596695Saguzovsk 		mutex_enter(pmtx);
5606695Saguzovsk 	}
5616695Saguzovsk 
5626695Saguzovsk 	hp = P_HASHBP(seg, htag0, addr, flags);
5630Sstevel@tonic-gate 	mutex_enter(&hp->p_hmutex);
5646695Saguzovsk again:
5650Sstevel@tonic-gate 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
5660Sstevel@tonic-gate 	    pcp = pcp->p_hnext) {
5676695Saguzovsk 		ASSERT(pcp->p_hashp == hp);
5686695Saguzovsk 		if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
5696695Saguzovsk 			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
5706695Saguzovsk 			ASSERT(pcp->p_active);
5716695Saguzovsk 			if (keep) {
5726695Saguzovsk 				/*
5736695Saguzovsk 				 * Don't remove this pcp entry
5746695Saguzovsk 				 * if we didn't find duplicate
5756695Saguzovsk 				 * shadow lists on second search.
5766695Saguzovsk 				 * Somebody removed those duplicates
5776695Saguzovsk 				 * since we dropped hash lock after first
5786695Saguzovsk 				 * search.
5796695Saguzovsk 				 */
5806695Saguzovsk 				ASSERT(pmtx != NULL);
5816695Saguzovsk 				ASSERT(!IS_PFLAGS_WIRED(flags));
5826695Saguzovsk 				mutex_exit(pmtx);
5836695Saguzovsk 				pmtx = NULL;
5846695Saguzovsk 			}
5850Sstevel@tonic-gate 			pcp->p_active--;
5866695Saguzovsk 			if (pcp->p_active == 0 && (pmtx != NULL ||
5876695Saguzovsk 			    (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
5886695Saguzovsk 
5896695Saguzovsk 				/*
5906695Saguzovsk 				 * This entry is no longer active.  Remove it
5916695Saguzovsk 				 * now either because pcaching is temporarily
5926695Saguzovsk 				 * disabled or there're other pcp entries that
5936695Saguzovsk 				 * disabled or there are other pcp entries that
5946695Saguzovsk 				 * entry is a duplicate).
5956695Saguzovsk 				 */
5960Sstevel@tonic-gate 
5970Sstevel@tonic-gate 				ASSERT(callback == pcp->p_callback);
5986695Saguzovsk 				if (pmtx != NULL) {
5996695Saguzovsk 					pcache_link_t *plinkp = &pcp->p_plink;
6006695Saguzovsk 					ASSERT(!IS_PCP_WIRED(pcp));
6016695Saguzovsk 					ASSERT(pheadp->p_lnext != pheadp);
6026695Saguzovsk 					ASSERT(pheadp->p_lprev != pheadp);
6036695Saguzovsk 					plinkp->p_lprev->p_lnext =
6046695Saguzovsk 					    plinkp->p_lnext;
6056695Saguzovsk 					plinkp->p_lnext->p_lprev =
6066695Saguzovsk 					    plinkp->p_lprev;
6076695Saguzovsk 				}
6080Sstevel@tonic-gate 				pcp->p_hprev->p_hnext = pcp->p_hnext;
6090Sstevel@tonic-gate 				pcp->p_hnext->p_hprev = pcp->p_hprev;
6106695Saguzovsk 				if (!IS_PCP_WIRED(pcp) &&
6116695Saguzovsk 				    hp->p_hnext == (struct seg_pcache *)hp) {
6126695Saguzovsk 					/*
6136695Saguzovsk 					 * We removed the last entry from this
6146695Saguzovsk 					 * bucket.  Now remove the bucket from
6156695Saguzovsk 					 * its active list.
6166695Saguzovsk 					 */
6176695Saguzovsk 					seg_premove_abuck(hp, 0);
6186695Saguzovsk 				}
6190Sstevel@tonic-gate 				mutex_exit(&hp->p_hmutex);
6206695Saguzovsk 				if (pmtx != NULL) {
6216695Saguzovsk 					mutex_exit(pmtx);
6226695Saguzovsk 				}
6236695Saguzovsk 				len = pcp->p_len;
6246695Saguzovsk 				npages = btop(len);
6256695Saguzovsk 				if (rw != S_WRITE && pcp->p_write) {
6266695Saguzovsk 					rw = S_WRITE;
6276695Saguzovsk 				}
6286695Saguzovsk 				kmem_cache_free(seg_pkmcache, pcp);
6296695Saguzovsk 				goto out;
6306695Saguzovsk 			} else {
6316695Saguzovsk 				/*
6326695Saguzovsk 				 * We found a matching pcp entry but will not
6336695Saguzovsk 				 * free it right away even if it's no longer
6346695Saguzovsk 				 * active.
6356695Saguzovsk 				 */
6366695Saguzovsk 				if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
6376695Saguzovsk 					/*
6386695Saguzovsk 					 * Set the reference bit and mark the
6396695Saguzovsk 					 * time of last access to this pcp
6406695Saguzovsk 					 * so that asynchronous thread doesn't
6416695Saguzovsk 					 * free it immediately since
6426695Saguzovsk 					 * it may be reactivated very soon.
6436695Saguzovsk 					 */
644*11066Srafael.vanoni@sun.com 					pcp->p_lbolt = ddi_get_lbolt();
6456695Saguzovsk 					pcp->p_ref = 1;
6466695Saguzovsk 				}
6476695Saguzovsk 				mutex_exit(&hp->p_hmutex);
6486695Saguzovsk 				if (pmtx != NULL) {
6496695Saguzovsk 					mutex_exit(pmtx);
6500Sstevel@tonic-gate 				}
6516695Saguzovsk 				return;
6526695Saguzovsk 			}
6536695Saguzovsk 		} else if (!IS_PFLAGS_WIRED(flags) &&
6546695Saguzovsk 		    P_MATCH(pcp, htag0, addr, len)) {
6556695Saguzovsk 			/*
6566695Saguzovsk 			 * This is a duplicate pcp entry.  This situation may
6576695Saguzovsk 			 * happen if a bigger shadow list that covers our
6586695Saguzovsk 			 * range was added while our entry was still active.
6596695Saguzovsk 			 * Now we can free our pcp entry if it becomes
6606695Saguzovsk 			 * inactive.
6616695Saguzovsk 			 */
6626695Saguzovsk 			if (!pcp->p_active) {
6636695Saguzovsk 				/*
6646695Saguzovsk 				 * Mark this entry as referenced just in case
6656695Saguzovsk 				 * we'll free our own pcp entry soon.
6666695Saguzovsk 				 */
667*11066Srafael.vanoni@sun.com 				pcp->p_lbolt = ddi_get_lbolt();
6686695Saguzovsk 				pcp->p_ref = 1;
6696695Saguzovsk 			}
6706695Saguzovsk 			if (pmtx != NULL) {
6716695Saguzovsk 				/*
6726695Saguzovsk 				 * we are already holding pmtx and found a
6736695Saguzovsk 				 * duplicate.  Don't keep our own pcp entry.
6746695Saguzovsk 				 */
6756695Saguzovsk 				keep = 0;
6766695Saguzovsk 				continue;
6770Sstevel@tonic-gate 			}
6786695Saguzovsk 			/*
6796695Saguzovsk 			 * We have to use mutex_tryenter to attempt to lock
6806695Saguzovsk 			 * seg/amp list lock since we already hold hash lock
6816695Saguzovsk 			 * and seg/amp list lock is above hash lock in lock
6826695Saguzovsk 			 * order.  If mutex_tryenter fails drop hash lock and
6836695Saguzovsk 			 * retake both locks in correct order and research
6846695Saguzovsk 			 * retake both locks in the correct order and re-search
6856695Saguzovsk 			 */
6866695Saguzovsk 			ASSERT(keep == 0);
6876695Saguzovsk 			if (amp == NULL) {
6886695Saguzovsk 				pheadp = &seg->s_phead;
6896695Saguzovsk 				pmtx = &seg->s_pmtx;
6906695Saguzovsk 			} else {
6916695Saguzovsk 				pheadp = &amp->a_phead;
6926695Saguzovsk 				pmtx = &amp->a_pmtx;
6936695Saguzovsk 			}
6946695Saguzovsk 			if (!mutex_tryenter(pmtx)) {
6956695Saguzovsk 				mutex_exit(&hp->p_hmutex);
6966695Saguzovsk 				mutex_enter(pmtx);
6976695Saguzovsk 				mutex_enter(&hp->p_hmutex);
6986695Saguzovsk 				/*
6996695Saguzovsk 				 * If we don't find bigger shadow list on
7006695Saguzovsk 				 * second search (it may happen since we
7016695Saguzovsk 				 * dropped bucket lock) keep the entry that
7026695Saguzovsk 				 * matches our own shadow list.
7036695Saguzovsk 				 */
7046695Saguzovsk 				keep = 1;
7056695Saguzovsk 				goto again;
7066695Saguzovsk 			}
7070Sstevel@tonic-gate 		}
7080Sstevel@tonic-gate 	}
7090Sstevel@tonic-gate 	mutex_exit(&hp->p_hmutex);
7106695Saguzovsk 	if (pmtx != NULL) {
7116695Saguzovsk 		mutex_exit(pmtx);
7126695Saguzovsk 	}
7130Sstevel@tonic-gate out:
7146695Saguzovsk 	(*callback)(htag0, addr, len, pp, rw, 0);
7156695Saguzovsk 	if (npages) {
7166695Saguzovsk 		mutex_enter(&seg_pmem_mtx);
7176695Saguzovsk 		ASSERT(seg_plocked >= npages);
7186695Saguzovsk 		seg_plocked -= npages;
7196695Saguzovsk 		if (!IS_PFLAGS_WIRED(flags)) {
7206695Saguzovsk 			ASSERT(seg_plocked_window >= npages);
7216695Saguzovsk 			seg_plocked_window -= npages;
7226695Saguzovsk 		}
7236695Saguzovsk 		mutex_exit(&seg_pmem_mtx);
7246695Saguzovsk 	}
7256695Saguzovsk 
7260Sstevel@tonic-gate }
7270Sstevel@tonic-gate 
7286695Saguzovsk #ifdef DEBUG
7296695Saguzovsk static uint32_t p_insert_chk_mtbf = 0;
7306695Saguzovsk #endif
7316695Saguzovsk 
7320Sstevel@tonic-gate /*
7330Sstevel@tonic-gate  * seg_pinsert_check() is used by segment drivers to predict whether
7340Sstevel@tonic-gate  * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
7350Sstevel@tonic-gate  */
7366695Saguzovsk /*ARGSUSED*/
7370Sstevel@tonic-gate int
7386695Saguzovsk seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
7396695Saguzovsk     size_t len, uint_t flags)
7400Sstevel@tonic-gate {
7416695Saguzovsk 	ASSERT(seg != NULL);
7420Sstevel@tonic-gate 
7436695Saguzovsk #ifdef DEBUG
7446695Saguzovsk 	if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
7450Sstevel@tonic-gate 		return (SEGP_FAIL);
7460Sstevel@tonic-gate 	}
7476695Saguzovsk #endif
7486695Saguzovsk 
7496695Saguzovsk 	if (seg_pdisabled) {
7500Sstevel@tonic-gate 		return (SEGP_FAIL);
7510Sstevel@tonic-gate 	}
7526695Saguzovsk 	ASSERT(seg_phashsize_win != 0);
7536695Saguzovsk 
7546695Saguzovsk 	if (IS_PFLAGS_WIRED(flags)) {
7556695Saguzovsk 		return (SEGP_SUCCESS);
7566695Saguzovsk 	}
7570Sstevel@tonic-gate 
7586695Saguzovsk 	if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
7596695Saguzovsk 		return (SEGP_FAIL);
7600Sstevel@tonic-gate 	}
7616695Saguzovsk 
7626695Saguzovsk 	if (freemem < desfree) {
7636695Saguzovsk 		return (SEGP_FAIL);
7646695Saguzovsk 	}
7656695Saguzovsk 
7660Sstevel@tonic-gate 	return (SEGP_SUCCESS);
7670Sstevel@tonic-gate }
7680Sstevel@tonic-gate 
7696695Saguzovsk #ifdef DEBUG
7706695Saguzovsk static uint32_t p_insert_mtbf = 0;
7716695Saguzovsk #endif
7720Sstevel@tonic-gate 
7730Sstevel@tonic-gate /*
7746695Saguzovsk  * Insert address range with shadow list into pagelock cache if there's no
7756695Saguzovsk  * shadow list already cached for this address range. If the cache is off or
7766695Saguzovsk  * caching is temporarily disabled or the allowed 'window' is exceeded return
7776695Saguzovsk  * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
7786695Saguzovsk  *
7796695Saguzovsk  * For non wired shadow lists (segvn case) include address in the hashing
7806695Saguzovsk  * function to avoid linking all the entries from the same segment or amp on
7816695Saguzovsk  * the same bucket.  amp is used instead of seg if amp is not NULL. Non wired
7826695Saguzovsk  * pcache entries are also linked on a per segment/amp list so that all
7836695Saguzovsk  * entries can be found quickly during seg/amp purge without walking the
7846695Saguzovsk  * entire pcache hash table.  For wired shadow lists (segspt case) we
7856695Saguzovsk  * don't use address hashing and per segment linking because the caller
7866695Saguzovsk  * currently inserts only one entry per segment that covers the entire
7876695Saguzovsk  * segment. If we used per segment linking even for segspt it would complicate
7886695Saguzovsk  * seg_ppurge_wiredpp() locking.
7896695Saguzovsk  *
7906695Saguzovsk  * Both hash bucket and per seg/amp locks need to be held before adding a non
7916695Saguzovsk  * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
7926695Saguzovsk  * first.
7936695Saguzovsk  *
7946695Saguzovsk  * This function will also remove from pcache old inactive shadow lists that
7956695Saguzovsk  * overlap with this request but cover smaller range for the same start
7966695Saguzovsk  * address.
7970Sstevel@tonic-gate  */
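/*
 * Lock ordering sketch for the non wired path below (this is the order
 * seg_pinactive() must also respect, which is why it uses mutex_tryenter()
 * on pmtx when it already holds the hash lock):
 *
 *	mutex_enter(pmtx);		per seg/amp list lock first
 *	mutex_enter(&hp->p_hmutex);	then the hash bucket lock
 *	... link pcp onto the per seg/amp and hash lists ...
 *	mutex_exit(&hp->p_hmutex);
 *	mutex_exit(pmtx);
 */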
7980Sstevel@tonic-gate int
7996695Saguzovsk seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
8006695Saguzovsk     size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
8016695Saguzovsk     seg_preclaim_cbfunc_t callback)
8020Sstevel@tonic-gate {
8030Sstevel@tonic-gate 	struct seg_pcache *pcp;
8040Sstevel@tonic-gate 	struct seg_phash *hp;
8050Sstevel@tonic-gate 	pgcnt_t npages;
8066695Saguzovsk 	pcache_link_t *pheadp;
8076695Saguzovsk 	kmutex_t *pmtx;
8086695Saguzovsk 	struct seg_pcache *delcallb_list = NULL;
8090Sstevel@tonic-gate 
8106695Saguzovsk 	ASSERT(seg != NULL);
8116695Saguzovsk 	ASSERT(rw == S_READ || rw == S_WRITE);
8126695Saguzovsk 	ASSERT(rw == S_READ || wlen == len);
8136695Saguzovsk 	ASSERT(rw == S_WRITE || wlen <= len);
8146695Saguzovsk 	ASSERT(amp == NULL || wlen == len);
8156695Saguzovsk 
8166695Saguzovsk #ifdef DEBUG
8176695Saguzovsk 	if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
8180Sstevel@tonic-gate 		return (SEGP_FAIL);
8190Sstevel@tonic-gate 	}
8206695Saguzovsk #endif
8216695Saguzovsk 
8226695Saguzovsk 	if (seg_pdisabled) {
8230Sstevel@tonic-gate 		return (SEGP_FAIL);
8240Sstevel@tonic-gate 	}
8256695Saguzovsk 	ASSERT(seg_phashsize_win != 0);
8266695Saguzovsk 
8276695Saguzovsk 	ASSERT((len & PAGEOFFSET) == 0);
8286695Saguzovsk 	npages = btop(len);
8296695Saguzovsk 	mutex_enter(&seg_pmem_mtx);
8306695Saguzovsk 	if (!IS_PFLAGS_WIRED(flags)) {
8316695Saguzovsk 		if (seg_plocked_window + npages > seg_pmaxwindow) {
8326695Saguzovsk 			mutex_exit(&seg_pmem_mtx);
8330Sstevel@tonic-gate 			return (SEGP_FAIL);
8340Sstevel@tonic-gate 		}
8356695Saguzovsk 		seg_plocked_window += npages;
8360Sstevel@tonic-gate 	}
8370Sstevel@tonic-gate 	seg_plocked += npages;
8386695Saguzovsk 	mutex_exit(&seg_pmem_mtx);
8390Sstevel@tonic-gate 
8406695Saguzovsk 	pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
8416695Saguzovsk 	/*
8426695Saguzovsk 	 * If amp is not NULL set htag0 to amp otherwise set it to seg.
8436695Saguzovsk 	 */
8446695Saguzovsk 	if (amp == NULL) {
8456695Saguzovsk 		pcp->p_htag0 = (void *)seg;
8466695Saguzovsk 		pcp->p_flags = flags & 0xffff;
8476695Saguzovsk 	} else {
8486695Saguzovsk 		pcp->p_htag0 = (void *)amp;
8496695Saguzovsk 		pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
8506695Saguzovsk 	}
8510Sstevel@tonic-gate 	pcp->p_addr = addr;
8520Sstevel@tonic-gate 	pcp->p_len = len;
8536695Saguzovsk 	pcp->p_wlen = wlen;
8540Sstevel@tonic-gate 	pcp->p_pp = pp;
8556695Saguzovsk 	pcp->p_write = (rw == S_WRITE);
8560Sstevel@tonic-gate 	pcp->p_callback = callback;
8570Sstevel@tonic-gate 	pcp->p_active = 1;
8580Sstevel@tonic-gate 
8596695Saguzovsk 	hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
8606695Saguzovsk 	if (!IS_PFLAGS_WIRED(flags)) {
8616695Saguzovsk 		int found;
8626695Saguzovsk 		void *htag0;
8636695Saguzovsk 		if (amp == NULL) {
8646695Saguzovsk 			pheadp = &seg->s_phead;
8656695Saguzovsk 			pmtx = &seg->s_pmtx;
8666695Saguzovsk 			htag0 = (void *)seg;
8676695Saguzovsk 		} else {
8686695Saguzovsk 			pheadp = &amp->a_phead;
8696695Saguzovsk 			pmtx = &amp->a_pmtx;
8706695Saguzovsk 			htag0 = (void *)amp;
8716695Saguzovsk 		}
8726695Saguzovsk 		mutex_enter(pmtx);
8736695Saguzovsk 		mutex_enter(&hp->p_hmutex);
8746695Saguzovsk 		delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
8756695Saguzovsk 		    len, &found);
8766695Saguzovsk 		if (found) {
8776695Saguzovsk 			mutex_exit(&hp->p_hmutex);
8786695Saguzovsk 			mutex_exit(pmtx);
8796695Saguzovsk 			mutex_enter(&seg_pmem_mtx);
8806695Saguzovsk 			seg_plocked -= npages;
8816695Saguzovsk 			seg_plocked_window -= npages;
8826695Saguzovsk 			mutex_exit(&seg_pmem_mtx);
8836695Saguzovsk 			kmem_cache_free(seg_pkmcache, pcp);
8846695Saguzovsk 			goto out;
8856695Saguzovsk 		}
8866695Saguzovsk 		pcp->p_plink.p_lnext = pheadp->p_lnext;
8876695Saguzovsk 		pcp->p_plink.p_lprev = pheadp;
8886695Saguzovsk 		pheadp->p_lnext->p_lprev = &pcp->p_plink;
8896695Saguzovsk 		pheadp->p_lnext = &pcp->p_plink;
8906695Saguzovsk 	} else {
8916695Saguzovsk 		mutex_enter(&hp->p_hmutex);
8926695Saguzovsk 	}
8936695Saguzovsk 	pcp->p_hashp = hp;
8940Sstevel@tonic-gate 	pcp->p_hnext = hp->p_hnext;
8950Sstevel@tonic-gate 	pcp->p_hprev = (struct seg_pcache *)hp;
8960Sstevel@tonic-gate 	hp->p_hnext->p_hprev = pcp;
8970Sstevel@tonic-gate 	hp->p_hnext = pcp;
8986695Saguzovsk 	if (!IS_PFLAGS_WIRED(flags) &&
8996695Saguzovsk 	    hp->p_hprev == pcp) {
9006695Saguzovsk 		seg_padd_abuck(hp);
9016695Saguzovsk 	}
9020Sstevel@tonic-gate 	mutex_exit(&hp->p_hmutex);
9036695Saguzovsk 	if (!IS_PFLAGS_WIRED(flags)) {
9046695Saguzovsk 		mutex_exit(pmtx);
9056695Saguzovsk 	}
9066695Saguzovsk 
9076695Saguzovsk out:
9086695Saguzovsk 	npages = 0;
9096695Saguzovsk 	while (delcallb_list != NULL) {
9106695Saguzovsk 		pcp = delcallb_list;
9116695Saguzovsk 		delcallb_list = pcp->p_hprev;
9126695Saguzovsk 		ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
9136695Saguzovsk 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
9146695Saguzovsk 		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
9156695Saguzovsk 		npages += btop(pcp->p_len);
9166695Saguzovsk 		kmem_cache_free(seg_pkmcache, pcp);
9176695Saguzovsk 	}
9186695Saguzovsk 	if (npages) {
9196695Saguzovsk 		ASSERT(!IS_PFLAGS_WIRED(flags));
9206695Saguzovsk 		mutex_enter(&seg_pmem_mtx);
9216695Saguzovsk 		ASSERT(seg_plocked >= npages);
9226695Saguzovsk 		ASSERT(seg_plocked_window >= npages);
9236695Saguzovsk 		seg_plocked -= npages;
9246695Saguzovsk 		seg_plocked_window -= npages;
9256695Saguzovsk 		mutex_exit(&seg_pmem_mtx);
9266695Saguzovsk 	}
9276695Saguzovsk 
9280Sstevel@tonic-gate 	return (SEGP_SUCCESS);
9290Sstevel@tonic-gate }
9300Sstevel@tonic-gate 
9310Sstevel@tonic-gate /*
9326695Saguzovsk  * purge entries from the pagelock cache if not active
9336695Saguzovsk  * and not recently used.
9340Sstevel@tonic-gate  */
9350Sstevel@tonic-gate static void
9366695Saguzovsk seg_ppurge_async(int force)
9370Sstevel@tonic-gate {
9380Sstevel@tonic-gate 	struct seg_pcache *delcallb_list = NULL;
9390Sstevel@tonic-gate 	struct seg_pcache *pcp;
9400Sstevel@tonic-gate 	struct seg_phash *hp;
9410Sstevel@tonic-gate 	pgcnt_t npages = 0;
9420Sstevel@tonic-gate 	pgcnt_t npages_window = 0;
9436695Saguzovsk 	pgcnt_t	npgs_to_purge;
9446695Saguzovsk 	pgcnt_t npgs_purged = 0;
9456695Saguzovsk 	int hlinks = 0;
9466695Saguzovsk 	int hlix;
9476695Saguzovsk 	pcache_link_t *hlinkp;
9486695Saguzovsk 	pcache_link_t *hlnextp = NULL;
9496695Saguzovsk 	int lowmem;
9506695Saguzovsk 	int trim;
9516695Saguzovsk 
9526695Saguzovsk 	ASSERT(seg_phashsize_win != 0);
9530Sstevel@tonic-gate 
9540Sstevel@tonic-gate 	/*
9556695Saguzovsk 	 * if the cache is off or empty, return
9560Sstevel@tonic-gate 	 */
9576695Saguzovsk 	if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
9580Sstevel@tonic-gate 		return;
9590Sstevel@tonic-gate 	}
9606695Saguzovsk 
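	/*
	 * Purge policy note: "lowmem" is set when free memory (less needfree)
	 * drops to roughly 1.25 * desfree, or when it is below about 7/8 of
	 * lotsfree (or below lotsfree) while pcache holds a large fraction of
	 * availrmem_initial.  "trim" is set once the window reaches 7/8 of
	 * seg_pmaxwindow.  Either condition purges about
	 * 1/2^seg_pshrink_shift of the window, capped at
	 * seg_pmaxapurge_npages (or at desfree, whichever is larger, when
	 * memory is low).
	 */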
9616695Saguzovsk 	if (!force) {
9626695Saguzovsk 		lowmem = 0;
9636695Saguzovsk 		trim = 0;
9646695Saguzovsk 		if (freemem < lotsfree + needfree) {
9656695Saguzovsk 			spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
9666695Saguzovsk 			if (fmem <= 5 * (desfree >> 2)) {
9676695Saguzovsk 				lowmem = 1;
9686695Saguzovsk 			} else if (fmem <= 7 * (lotsfree >> 3)) {
9696695Saguzovsk 				if (seg_plocked_window >=
9706695Saguzovsk 				    (availrmem_initial >> 1)) {
9716695Saguzovsk 					lowmem = 1;
9726695Saguzovsk 				}
9736695Saguzovsk 			} else if (fmem < lotsfree) {
9746695Saguzovsk 				if (seg_plocked_window >=
9756695Saguzovsk 				    3 * (availrmem_initial >> 2)) {
9766695Saguzovsk 					lowmem = 1;
9776695Saguzovsk 				}
9786695Saguzovsk 			}
9796695Saguzovsk 		}
9806695Saguzovsk 		if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
9816695Saguzovsk 			trim = 1;
9826695Saguzovsk 		}
9836695Saguzovsk 		if (!lowmem && !trim) {
9846695Saguzovsk 			return;
9856695Saguzovsk 		}
9866695Saguzovsk 		npgs_to_purge = seg_plocked_window >>
9876695Saguzovsk 		    seg_pshrink_shift;
9886695Saguzovsk 		if (lowmem) {
9896695Saguzovsk 			npgs_to_purge = MIN(npgs_to_purge,
9906695Saguzovsk 			    MAX(seg_pmaxapurge_npages, desfree));
9916695Saguzovsk 		} else {
9926695Saguzovsk 			npgs_to_purge = MIN(npgs_to_purge,
9936695Saguzovsk 			    seg_pmaxapurge_npages);
9946695Saguzovsk 		}
9956695Saguzovsk 		if (npgs_to_purge == 0) {
9966695Saguzovsk 			return;
9976695Saguzovsk 		}
9986695Saguzovsk 	} else {
9996695Saguzovsk 		struct seg_phash_wired *hpw;
10006695Saguzovsk 
10016695Saguzovsk 		ASSERT(seg_phashsize_wired != 0);
10026695Saguzovsk 
10036695Saguzovsk 		for (hpw = seg_phashtab_wired;
10046695Saguzovsk 		    hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
10056695Saguzovsk 
10066695Saguzovsk 			if (hpw->p_hnext == (struct seg_pcache *)hpw) {
10076695Saguzovsk 				continue;
10086695Saguzovsk 			}
10096695Saguzovsk 
10106695Saguzovsk 			mutex_enter(&hpw->p_hmutex);
10116695Saguzovsk 
10126695Saguzovsk 			for (pcp = hpw->p_hnext;
10136695Saguzovsk 			    pcp != (struct seg_pcache *)hpw;
10146695Saguzovsk 			    pcp = pcp->p_hnext) {
10156695Saguzovsk 
10166695Saguzovsk 				ASSERT(IS_PCP_WIRED(pcp));
10176695Saguzovsk 				ASSERT(pcp->p_hashp ==
10186695Saguzovsk 				    (struct seg_phash *)hpw);
10196695Saguzovsk 
10206695Saguzovsk 				if (pcp->p_active) {
10216695Saguzovsk 					continue;
10226695Saguzovsk 				}
10236695Saguzovsk 				pcp->p_hprev->p_hnext = pcp->p_hnext;
10246695Saguzovsk 				pcp->p_hnext->p_hprev = pcp->p_hprev;
10256695Saguzovsk 				pcp->p_hprev = delcallb_list;
10266695Saguzovsk 				delcallb_list = pcp;
10276695Saguzovsk 			}
10286695Saguzovsk 			mutex_exit(&hpw->p_hmutex);
10296695Saguzovsk 		}
10306695Saguzovsk 	}
10316695Saguzovsk 
10326695Saguzovsk 	mutex_enter(&seg_pmem_mtx);
10336695Saguzovsk 	if (seg_pathr_on) {
10346695Saguzovsk 		mutex_exit(&seg_pmem_mtx);
10356695Saguzovsk 		goto runcb;
10366695Saguzovsk 	}
10376695Saguzovsk 	seg_pathr_on = 1;
10386695Saguzovsk 	mutex_exit(&seg_pmem_mtx);
10396695Saguzovsk 	ASSERT(seg_pahcur <= 1);
10406695Saguzovsk 	hlix = !seg_pahcur;
10416695Saguzovsk 
10426695Saguzovsk again:
10436695Saguzovsk 	for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
10446695Saguzovsk 	    hlinkp = hlnextp) {
10456695Saguzovsk 
10466695Saguzovsk 		hlnextp = hlinkp->p_lnext;
10476695Saguzovsk 		ASSERT(hlnextp != NULL);
10486695Saguzovsk 
10496695Saguzovsk 		hp = hlink2phash(hlinkp, hlix);
10506695Saguzovsk 		if (hp->p_hnext == (struct seg_pcache *)hp) {
10516695Saguzovsk 			seg_pathr_empty_ahb++;
10526695Saguzovsk 			continue;
10536695Saguzovsk 		}
10546695Saguzovsk 		seg_pathr_full_ahb++;
10550Sstevel@tonic-gate 		mutex_enter(&hp->p_hmutex);
10566695Saguzovsk 
10576695Saguzovsk 		for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
10586695Saguzovsk 		    pcp = pcp->p_hnext) {
10596695Saguzovsk 			pcache_link_t *pheadp;
10606695Saguzovsk 			pcache_link_t *plinkp;
10616695Saguzovsk 			void *htag0;
10626695Saguzovsk 			kmutex_t *pmtx;
10636695Saguzovsk 
10646695Saguzovsk 			ASSERT(!IS_PCP_WIRED(pcp));
10656695Saguzovsk 			ASSERT(pcp->p_hashp == hp);
10666695Saguzovsk 
10676695Saguzovsk 			if (pcp->p_active) {
10686695Saguzovsk 				continue;
10696695Saguzovsk 			}
10706695Saguzovsk 			if (!force && pcp->p_ref &&
10716695Saguzovsk 			    PCP_AGE(pcp) < seg_pmax_pcpage) {
10726695Saguzovsk 				pcp->p_ref = 0;
10736695Saguzovsk 				continue;
10746695Saguzovsk 			}
10756695Saguzovsk 			plinkp = &pcp->p_plink;
10766695Saguzovsk 			htag0 = pcp->p_htag0;
10776695Saguzovsk 			if (pcp->p_flags & SEGP_AMP) {
10786695Saguzovsk 				pheadp = &((amp_t *)htag0)->a_phead;
10796695Saguzovsk 				pmtx = &((amp_t *)htag0)->a_pmtx;
10806695Saguzovsk 			} else {
10816695Saguzovsk 				pheadp = &((seg_t *)htag0)->s_phead;
10826695Saguzovsk 				pmtx = &((seg_t *)htag0)->s_pmtx;
10836695Saguzovsk 			}
10846695Saguzovsk 			if (!mutex_tryenter(pmtx)) {
10856695Saguzovsk 				continue;
10866695Saguzovsk 			}
10876695Saguzovsk 			ASSERT(pheadp->p_lnext != pheadp);
10886695Saguzovsk 			ASSERT(pheadp->p_lprev != pheadp);
10896695Saguzovsk 			plinkp->p_lprev->p_lnext =
10906695Saguzovsk 			    plinkp->p_lnext;
10916695Saguzovsk 			plinkp->p_lnext->p_lprev =
10926695Saguzovsk 			    plinkp->p_lprev;
10936695Saguzovsk 			pcp->p_hprev->p_hnext = pcp->p_hnext;
10946695Saguzovsk 			pcp->p_hnext->p_hprev = pcp->p_hprev;
10956695Saguzovsk 			mutex_exit(pmtx);
10966695Saguzovsk 			pcp->p_hprev = delcallb_list;
10976695Saguzovsk 			delcallb_list = pcp;
10986695Saguzovsk 			npgs_purged += btop(pcp->p_len);
10996695Saguzovsk 		}
11006695Saguzovsk 		if (hp->p_hnext == (struct seg_pcache *)hp) {
11016695Saguzovsk 			seg_premove_abuck(hp, 1);
11026695Saguzovsk 		}
11036695Saguzovsk 		mutex_exit(&hp->p_hmutex);
11046695Saguzovsk 		if (npgs_purged >= seg_plocked_window) {
11056695Saguzovsk 			break;
11066695Saguzovsk 		}
11076695Saguzovsk 		if (!force) {
11086695Saguzovsk 			if (npgs_purged >= npgs_to_purge) {
11096695Saguzovsk 				break;
11106695Saguzovsk 			}
11116695Saguzovsk 			if (!trim && !(seg_pathr_full_ahb & 15)) {
11126695Saguzovsk 				ASSERT(lowmem);
11136695Saguzovsk 				if (freemem >= lotsfree + needfree) {
11146695Saguzovsk 					break;
11156695Saguzovsk 				}
11166695Saguzovsk 			}
11176695Saguzovsk 		}
11186695Saguzovsk 	}
11196695Saguzovsk 
11206695Saguzovsk 	if (hlinkp == &seg_pahhead[hlix]) {
11216695Saguzovsk 		/*
11226695Saguzovsk 		 * We processed the entire hlix active bucket list
11236695Saguzovsk 		 * but didn't find enough pages to reclaim.
11246695Saguzovsk 		 * Switch the lists and walk the other list
11256695Saguzovsk 		 * if we haven't done it yet.
11266695Saguzovsk 		 */
11276695Saguzovsk 		mutex_enter(&seg_pmem_mtx);
11286695Saguzovsk 		ASSERT(seg_pathr_on);
11296695Saguzovsk 		ASSERT(seg_pahcur == !hlix);
11306695Saguzovsk 		seg_pahcur = hlix;
11316695Saguzovsk 		mutex_exit(&seg_pmem_mtx);
11326695Saguzovsk 		if (++hlinks < 2) {
11336695Saguzovsk 			hlix = !hlix;
11346695Saguzovsk 			goto again;
11356695Saguzovsk 		}
11366695Saguzovsk 	} else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
11376695Saguzovsk 	    seg_pahhead[hlix].p_lnext != hlinkp) {
11386695Saguzovsk 		ASSERT(hlinkp != NULL);
11396695Saguzovsk 		ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
11406695Saguzovsk 		ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
11416695Saguzovsk 		ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
11420Sstevel@tonic-gate 
11430Sstevel@tonic-gate 		/*
11446695Saguzovsk 		 * Reinsert the header to point to hlinkp
11456695Saguzovsk 		 * so that we start from hlinkp bucket next time around.
11460Sstevel@tonic-gate 		 */
11476695Saguzovsk 		seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
11486695Saguzovsk 		seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
11496695Saguzovsk 		seg_pahhead[hlix].p_lnext = hlinkp;
11506695Saguzovsk 		seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
11516695Saguzovsk 		hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
11526695Saguzovsk 		hlinkp->p_lprev = &seg_pahhead[hlix];
11536695Saguzovsk 	}
11546695Saguzovsk 
11556695Saguzovsk 	mutex_enter(&seg_pmem_mtx);
11566695Saguzovsk 	ASSERT(seg_pathr_on);
11576695Saguzovsk 	seg_pathr_on = 0;
11586695Saguzovsk 	mutex_exit(&seg_pmem_mtx);
11590Sstevel@tonic-gate 
11606695Saguzovsk runcb:
11616695Saguzovsk 	/*
11626695Saguzovsk 	 * Run the delayed callback list. segments/amps can't go away until
11636695Saguzovsk 	 * callback is executed since they must have non 0 softlockcnt. That's
11646695Saguzovsk 	 * why we don't need to hold as/seg/amp locks to execute the callback.
11656695Saguzovsk 	 */
11666695Saguzovsk 	while (delcallb_list != NULL) {
11676695Saguzovsk 		pcp = delcallb_list;
11686695Saguzovsk 		delcallb_list = pcp->p_hprev;
11696695Saguzovsk 		ASSERT(!pcp->p_active);
11706695Saguzovsk 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
11716695Saguzovsk 		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
11726695Saguzovsk 		npages += btop(pcp->p_len);
11736695Saguzovsk 		if (!IS_PCP_WIRED(pcp)) {
11746695Saguzovsk 			npages_window += btop(pcp->p_len);
11756695Saguzovsk 		}
11766695Saguzovsk 		kmem_cache_free(seg_pkmcache, pcp);
11776695Saguzovsk 	}
11786695Saguzovsk 	if (npages) {
11796695Saguzovsk 		mutex_enter(&seg_pmem_mtx);
11806695Saguzovsk 		ASSERT(seg_plocked >= npages);
11816695Saguzovsk 		ASSERT(seg_plocked_window >= npages_window);
11826695Saguzovsk 		seg_plocked -= npages;
11836695Saguzovsk 		seg_plocked_window -= npages_window;
11846695Saguzovsk 		mutex_exit(&seg_pmem_mtx);
11856695Saguzovsk 	}
11866695Saguzovsk }
11876695Saguzovsk 
11886695Saguzovsk /*
11896695Saguzovsk  * Remove cached shadow list entries for segment(s) from the hash table.  The
11906695Saguzovsk  * segments are identified by the pp array. This is useful when multiple segs
11916695Saguzovsk  * are cached on behalf of a dummy segment (ISM/DISM) with a common pp array.
11926695Saguzovsk  */
11936695Saguzovsk void
11946695Saguzovsk seg_ppurge_wiredpp(struct page **pp)
11956695Saguzovsk {
11966695Saguzovsk 	struct seg_pcache *pcp;
11976695Saguzovsk 	struct seg_phash_wired *hp;
11986695Saguzovsk 	pgcnt_t npages = 0;
11996695Saguzovsk 	struct	seg_pcache *delcallb_list = NULL;
12006695Saguzovsk 
12016695Saguzovsk 	/*
12026695Saguzovsk 	 * if the cache is empty, return
12036695Saguzovsk 	 */
12046695Saguzovsk 	if (seg_plocked == 0) {
12056695Saguzovsk 		return;
12066695Saguzovsk 	}
12076695Saguzovsk 	ASSERT(seg_phashsize_wired != 0);
12086695Saguzovsk 
12096695Saguzovsk 	for (hp = seg_phashtab_wired;
12106695Saguzovsk 	    hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
12116695Saguzovsk 		if (hp->p_hnext == (struct seg_pcache *)hp) {
12126695Saguzovsk 			continue;
12136695Saguzovsk 		}
12146695Saguzovsk 		mutex_enter(&hp->p_hmutex);
12156695Saguzovsk 		pcp = hp->p_hnext;
12166695Saguzovsk 		while (pcp != (struct seg_pcache *)hp) {
12176695Saguzovsk 			ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
12186695Saguzovsk 			ASSERT(IS_PCP_WIRED(pcp));
12190Sstevel@tonic-gate 			/*
12206695Saguzovsk 			 * purge entries which are not active
12210Sstevel@tonic-gate 			 */
12226695Saguzovsk 			if (!pcp->p_active && pcp->p_pp == pp) {
12236695Saguzovsk 				ASSERT(pcp->p_htag0 != NULL);
12246695Saguzovsk 				pcp->p_hprev->p_hnext = pcp->p_hnext;
12256695Saguzovsk 				pcp->p_hnext->p_hprev = pcp->p_hprev;
12266695Saguzovsk 				pcp->p_hprev = delcallb_list;
12276695Saguzovsk 				delcallb_list = pcp;
12280Sstevel@tonic-gate 			}
12290Sstevel@tonic-gate 			pcp = pcp->p_hnext;
12300Sstevel@tonic-gate 		}
12310Sstevel@tonic-gate 		mutex_exit(&hp->p_hmutex);
12326695Saguzovsk 		/*
12336695Saguzovsk 		 * Segments can't go away until the callback is executed since
12346695Saguzovsk 		 * they must have a non-zero softlockcnt. That's why we don't
12356695Saguzovsk 		 * need to hold as/seg locks to execute the callback.
12366695Saguzovsk 		 */
12376695Saguzovsk 		while (delcallb_list != NULL) {
12386695Saguzovsk 			int done;
12396695Saguzovsk 			pcp = delcallb_list;
12406695Saguzovsk 			delcallb_list = pcp->p_hprev;
12416695Saguzovsk 			ASSERT(!pcp->p_active);
12426695Saguzovsk 			done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
12436695Saguzovsk 			    pcp->p_len, pcp->p_pp,
12446695Saguzovsk 			    pcp->p_write ? S_WRITE : S_READ, 1);
12456695Saguzovsk 			npages += btop(pcp->p_len);
12466695Saguzovsk 			ASSERT(IS_PCP_WIRED(pcp));
12476695Saguzovsk 			kmem_cache_free(seg_pkmcache, pcp);
12486695Saguzovsk 			if (done) {
12496695Saguzovsk 				ASSERT(delcallb_list == NULL);
12506695Saguzovsk 				goto out;
12516695Saguzovsk 			}
12526695Saguzovsk 		}
12530Sstevel@tonic-gate 	}
12540Sstevel@tonic-gate 
12556695Saguzovsk out:
12566695Saguzovsk 	mutex_enter(&seg_pmem_mtx);
12576695Saguzovsk 	ASSERT(seg_plocked >= npages);
12580Sstevel@tonic-gate 	seg_plocked -= npages;
12596695Saguzovsk 	mutex_exit(&seg_pmem_mtx);
12600Sstevel@tonic-gate }
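/*
 * Illustrative usage sketch (an assumption, not from the original source):
 * a driver that cached wired entries for several segments sharing one pp
 * array, as described above, would flush all of them at once:
 *
 *	seg_ppurge_wiredpp(shared_pp);
 *
 * where shared_pp is the same struct page ** the entries were created with.
 */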
12610Sstevel@tonic-gate 
12620Sstevel@tonic-gate /*
12630Sstevel@tonic-gate  * Purge all entries for a given segment. Since we
12640Sstevel@tonic-gate  * call back into the segment driver directly for page
12650Sstevel@tonic-gate  * reclaim, the caller needs to hold the right locks.
12660Sstevel@tonic-gate  */
12670Sstevel@tonic-gate void
12686695Saguzovsk seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
12690Sstevel@tonic-gate {
12700Sstevel@tonic-gate 	struct seg_pcache *delcallb_list = NULL;
12710Sstevel@tonic-gate 	struct seg_pcache *pcp;
12720Sstevel@tonic-gate 	struct seg_phash *hp;
12730Sstevel@tonic-gate 	pgcnt_t npages = 0;
12746695Saguzovsk 	void *htag0;
12750Sstevel@tonic-gate 
12766695Saguzovsk 	if (seg_plocked == 0) {
12770Sstevel@tonic-gate 		return;
12780Sstevel@tonic-gate 	}
12796695Saguzovsk 	ASSERT(seg_phashsize_win != 0);
12806695Saguzovsk 
12816695Saguzovsk 	/*
12826695Saguzovsk 	 * If amp is not NULL, use amp as the lookup tag; otherwise use
12836695Saguzovsk 	 * seg as the lookup tag.
12846695Saguzovsk 	 */
12856695Saguzovsk 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
12866695Saguzovsk 	ASSERT(htag0 != NULL);
12876695Saguzovsk 	if (IS_PFLAGS_WIRED(flags)) {
12886695Saguzovsk 		hp = P_HASHBP(seg, htag0, 0, flags);
12896695Saguzovsk 		mutex_enter(&hp->p_hmutex);
12906695Saguzovsk 		pcp = hp->p_hnext;
12916695Saguzovsk 		while (pcp != (struct seg_pcache *)hp) {
12926695Saguzovsk 			ASSERT(pcp->p_hashp == hp);
12936695Saguzovsk 			ASSERT(IS_PCP_WIRED(pcp));
12946695Saguzovsk 			if (pcp->p_htag0 == htag0) {
12956695Saguzovsk 				if (pcp->p_active) {
12966695Saguzovsk 					break;
12976695Saguzovsk 				}
12986695Saguzovsk 				pcp->p_hprev->p_hnext = pcp->p_hnext;
12996695Saguzovsk 				pcp->p_hnext->p_hprev = pcp->p_hprev;
13006695Saguzovsk 				pcp->p_hprev = delcallb_list;
13016695Saguzovsk 				delcallb_list = pcp;
13026695Saguzovsk 			}
13036695Saguzovsk 			pcp = pcp->p_hnext;
13046695Saguzovsk 		}
13056695Saguzovsk 		mutex_exit(&hp->p_hmutex);
13066695Saguzovsk 	} else {
13076695Saguzovsk 		pcache_link_t *plinkp;
13086695Saguzovsk 		pcache_link_t *pheadp;
13096695Saguzovsk 		kmutex_t *pmtx;
13106695Saguzovsk 
13116695Saguzovsk 		if (amp == NULL) {
13126695Saguzovsk 			ASSERT(seg != NULL);
13136695Saguzovsk 			pheadp = &seg->s_phead;
13146695Saguzovsk 			pmtx = &seg->s_pmtx;
13156695Saguzovsk 		} else {
13166695Saguzovsk 			pheadp = &amp->a_phead;
13176695Saguzovsk 			pmtx = &amp->a_pmtx;
13186695Saguzovsk 		}
13196695Saguzovsk 		mutex_enter(pmtx);
13206695Saguzovsk 		while ((plinkp = pheadp->p_lnext) != pheadp) {
13216695Saguzovsk 			pcp = plink2pcache(plinkp);
13226695Saguzovsk 			ASSERT(!IS_PCP_WIRED(pcp));
13236695Saguzovsk 			ASSERT(pcp->p_htag0 == htag0);
13246695Saguzovsk 			hp = pcp->p_hashp;
13256695Saguzovsk 			mutex_enter(&hp->p_hmutex);
13260Sstevel@tonic-gate 			if (pcp->p_active) {
13276695Saguzovsk 				mutex_exit(&hp->p_hmutex);
13280Sstevel@tonic-gate 				break;
13290Sstevel@tonic-gate 			}
13306695Saguzovsk 			ASSERT(plinkp->p_lprev == pheadp);
13316695Saguzovsk 			pheadp->p_lnext = plinkp->p_lnext;
13326695Saguzovsk 			plinkp->p_lnext->p_lprev = pheadp;
13330Sstevel@tonic-gate 			pcp->p_hprev->p_hnext = pcp->p_hnext;
13340Sstevel@tonic-gate 			pcp->p_hnext->p_hprev = pcp->p_hprev;
13350Sstevel@tonic-gate 			pcp->p_hprev = delcallb_list;
13360Sstevel@tonic-gate 			delcallb_list = pcp;
13376695Saguzovsk 			if (hp->p_hnext == (struct seg_pcache *)hp) {
13386695Saguzovsk 				seg_premove_abuck(hp, 0);
13396695Saguzovsk 			}
13406695Saguzovsk 			mutex_exit(&hp->p_hmutex);
13410Sstevel@tonic-gate 		}
13426695Saguzovsk 		mutex_exit(pmtx);
13430Sstevel@tonic-gate 	}
13440Sstevel@tonic-gate 	while (delcallb_list != NULL) {
13450Sstevel@tonic-gate 		pcp = delcallb_list;
13460Sstevel@tonic-gate 		delcallb_list = pcp->p_hprev;
13476695Saguzovsk 		ASSERT(!pcp->p_active);
13486695Saguzovsk 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
13496695Saguzovsk 		    pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
13506695Saguzovsk 		npages += btop(pcp->p_len);
13516695Saguzovsk 		kmem_cache_free(seg_pkmcache, pcp);
13520Sstevel@tonic-gate 	}
13536695Saguzovsk 	mutex_enter(&seg_pmem_mtx);
13546695Saguzovsk 	ASSERT(seg_plocked >= npages);
13550Sstevel@tonic-gate 	seg_plocked -= npages;
13566695Saguzovsk 	if (!IS_PFLAGS_WIRED(flags)) {
13576695Saguzovsk 		ASSERT(seg_plocked_window >= npages);
13586695Saguzovsk 		seg_plocked_window -= npages;
13596695Saguzovsk 	}
13606695Saguzovsk 	mutex_exit(&seg_pmem_mtx);
13610Sstevel@tonic-gate }
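/*
 * Illustrative usage sketch (assumptions noted): with flags of 0 the call
 * purges the non-wired entries for one segment, and a driver that tagged
 * its entries with an anon_map passes the amp instead.  As noted above,
 * the caller must already hold the locks the reclaim callback expects.
 *
 *	seg_ppurge(seg, NULL, 0);	(entries tagged by seg)
 *	seg_ppurge(NULL, amp, 0);	(entries tagged by amp)
 */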
13620Sstevel@tonic-gate 
13630Sstevel@tonic-gate static void seg_pinit_mem_config(void);
13640Sstevel@tonic-gate 
13650Sstevel@tonic-gate /*
13660Sstevel@tonic-gate  * Set up the pagelock cache.
13670Sstevel@tonic-gate  */
13680Sstevel@tonic-gate static void
13690Sstevel@tonic-gate seg_pinit(void)
13700Sstevel@tonic-gate {
13710Sstevel@tonic-gate 	struct seg_phash *hp;
13726695Saguzovsk 	ulong_t i;
13736695Saguzovsk 	pgcnt_t physmegs;
13746695Saguzovsk 
13756695Saguzovsk 	seg_plocked = 0;
13766695Saguzovsk 	seg_plocked_window = 0;
13776695Saguzovsk 
13786695Saguzovsk 	if (segpcache_enabled == 0) {
13796695Saguzovsk 		seg_phashsize_win = 0;
13806695Saguzovsk 		seg_phashsize_wired = 0;
13816695Saguzovsk 		seg_pdisabled = 1;
13826695Saguzovsk 		return;
13836695Saguzovsk 	}
13840Sstevel@tonic-gate 
13856695Saguzovsk 	seg_pdisabled = 0;
13866695Saguzovsk 	seg_pkmcache = kmem_cache_create("seg_pcache",
13876695Saguzovsk 	    sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
13886695Saguzovsk 	if (segpcache_pcp_maxage_ticks <= 0) {
13896695Saguzovsk 		segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
13906695Saguzovsk 	}
13916695Saguzovsk 	seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
13926695Saguzovsk 	seg_pathr_empty_ahb = 0;
13936695Saguzovsk 	seg_pathr_full_ahb = 0;
13946695Saguzovsk 	seg_pshrink_shift = segpcache_shrink_shift;
13956695Saguzovsk 	seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
13960Sstevel@tonic-gate 
13976695Saguzovsk 	mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
13986695Saguzovsk 	mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
13996695Saguzovsk 	mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
14006695Saguzovsk 	cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
14016695Saguzovsk 
14026695Saguzovsk 	physmegs = physmem >> (20 - PAGESHIFT);
14030Sstevel@tonic-gate 
14046695Saguzovsk 	/*
14056695Saguzovsk 	 * If segpcache_hashsize_win was not set in /etc/system, or was set
14066695Saguzovsk 	 * to an absurd value, fall back to a default.
14076695Saguzovsk 	 */
14086695Saguzovsk 	if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
14096695Saguzovsk 		/*
14106695Saguzovsk 		 * Create one bucket per 32K (or at least per 8 pages) of
14116695Saguzovsk 		 * available memory.
14126695Saguzovsk 		 */
14136695Saguzovsk 		pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
14146695Saguzovsk 		segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
14156695Saguzovsk 	}
14166695Saguzovsk 	if (!ISP2(segpcache_hashsize_win)) {
14176695Saguzovsk 		ulong_t rndfac = ~(1UL <<
14186695Saguzovsk 		    (highbit(segpcache_hashsize_win) - 1));
14196695Saguzovsk 		rndfac &= segpcache_hashsize_win;
14206695Saguzovsk 		segpcache_hashsize_win += rndfac;
14216695Saguzovsk 		segpcache_hashsize_win = 1 <<
14226695Saguzovsk 		    (highbit(segpcache_hashsize_win) - 1);
14236695Saguzovsk 	}
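	/*
	 * Worked example of the rounding above (illustrative, not from the
	 * original source): for a value of 1500, highbit() is 11, so rndfac
	 * is 1500 & ~1024 = 476 and the sum 1976 still rounds down to 1024;
	 * for 1600 the sum is 2176, which rounds up to 2048.  The net effect
	 * is rounding to the nearest power of two, rounding up once the
	 * remainder reaches half of the leading power of two.
	 */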
14246695Saguzovsk 	seg_phashsize_win = segpcache_hashsize_win;
14256695Saguzovsk 	seg_phashtab_win = kmem_zalloc(
14266695Saguzovsk 	    seg_phashsize_win * sizeof (struct seg_phash),
14276695Saguzovsk 	    KM_SLEEP);
14286695Saguzovsk 	for (i = 0; i < seg_phashsize_win; i++) {
14296695Saguzovsk 		hp = &seg_phashtab_win[i];
14306695Saguzovsk 		hp->p_hnext = (struct seg_pcache *)hp;
14316695Saguzovsk 		hp->p_hprev = (struct seg_pcache *)hp;
14326695Saguzovsk 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
14336695Saguzovsk 	}
14340Sstevel@tonic-gate 
14356695Saguzovsk 	seg_pahcur = 0;
14366695Saguzovsk 	seg_pathr_on = 0;
14376695Saguzovsk 	seg_pahhead[0].p_lnext = &seg_pahhead[0];
14386695Saguzovsk 	seg_pahhead[0].p_lprev = &seg_pahhead[0];
14396695Saguzovsk 	seg_pahhead[1].p_lnext = &seg_pahhead[1];
14406695Saguzovsk 	seg_pahhead[1].p_lprev = &seg_pahhead[1];
14416695Saguzovsk 
14426695Saguzovsk 	/*
14436695Saguzovsk 	 * If segpcache_hashsize_wired was not set in /etc/system, or was set
14446695Saguzovsk 	 * to an absurd value, fall back to a default.
14456695Saguzovsk 	 */
14466695Saguzovsk 	if (segpcache_hashsize_wired == 0 ||
14476695Saguzovsk 	    segpcache_hashsize_wired > physmem / 4) {
14486695Saguzovsk 		/*
14496695Saguzovsk 		 * Choose segpcache_hashsize_wired based on physmem.
14506695Saguzovsk 		 * Create one bucket per 128K bytes, up to 256K buckets.
14516695Saguzovsk 		 */
14526695Saguzovsk 		if (physmegs < 20 * 1024) {
14536695Saguzovsk 			segpcache_hashsize_wired = MAX(1024, physmegs << 3);
14546695Saguzovsk 		} else {
14556695Saguzovsk 			segpcache_hashsize_wired = 256 * 1024;
14560Sstevel@tonic-gate 		}
14570Sstevel@tonic-gate 	}
14586695Saguzovsk 	if (!ISP2(segpcache_hashsize_wired)) {
14596695Saguzovsk 		segpcache_hashsize_wired = 1 <<
14606695Saguzovsk 		    highbit(segpcache_hashsize_wired);
14616695Saguzovsk 	}
14626695Saguzovsk 	seg_phashsize_wired = segpcache_hashsize_wired;
14636695Saguzovsk 	seg_phashtab_wired = kmem_zalloc(
14646695Saguzovsk 	    seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
14656695Saguzovsk 	for (i = 0; i < seg_phashsize_wired; i++) {
14666695Saguzovsk 		hp = (struct seg_phash *)&seg_phashtab_wired[i];
14676695Saguzovsk 		hp->p_hnext = (struct seg_pcache *)hp;
14686695Saguzovsk 		hp->p_hprev = (struct seg_pcache *)hp;
14696695Saguzovsk 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
14706695Saguzovsk 	}
14710Sstevel@tonic-gate 
14726695Saguzovsk 	if (segpcache_maxwindow == 0) {
14736695Saguzovsk 		if (physmegs < 64) {
14746695Saguzovsk 			/* 3% of memory */
14756695Saguzovsk 			segpcache_maxwindow = availrmem >> 5;
14766695Saguzovsk 		} else if (physmegs < 512) {
14776695Saguzovsk 			/* 12% of memory */
14786695Saguzovsk 			segpcache_maxwindow = availrmem >> 3;
14796695Saguzovsk 		} else if (physmegs < 1024) {
14806695Saguzovsk 			/* 25% of memory */
14816695Saguzovsk 			segpcache_maxwindow = availrmem >> 2;
14826695Saguzovsk 		} else if (physmegs < 2048) {
14836695Saguzovsk 			/* 50% of memory */
14846695Saguzovsk 			segpcache_maxwindow = availrmem >> 1;
14856695Saguzovsk 		} else {
14866695Saguzovsk 			/* no limit */
14876695Saguzovsk 			segpcache_maxwindow = (pgcnt_t)-1;
14886695Saguzovsk 		}
14896695Saguzovsk 	}
14906695Saguzovsk 	seg_pmaxwindow = segpcache_maxwindow;
14910Sstevel@tonic-gate 	seg_pinit_mem_config();
14920Sstevel@tonic-gate }
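/*
 * The segpcache_* tunables consumed by seg_pinit() are normally set from
 * /etc/system; setting segpcache_enabled to 0 disables the cache entirely.
 * A minimal sketch (the values are illustrative, not recommendations):
 *
 *	set segpcache_hashsize_win = 0x4000
 *	set segpcache_maxwindow = 0x20000
 *
 * Zero or absurdly large values fall back to the physmem-based defaults
 * computed above, and non-power-of-two hash sizes are rounded.
 */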
14930Sstevel@tonic-gate 
14940Sstevel@tonic-gate /*
14950Sstevel@tonic-gate  * called by pageout if memory is low
14960Sstevel@tonic-gate  */
14970Sstevel@tonic-gate void
14980Sstevel@tonic-gate seg_preap(void)
14990Sstevel@tonic-gate {
15000Sstevel@tonic-gate 	/*
15016695Saguzovsk 	 * if the cache is off or empty, return
15020Sstevel@tonic-gate 	 */
15036695Saguzovsk 	if (seg_plocked_window == 0) {
15040Sstevel@tonic-gate 		return;
15050Sstevel@tonic-gate 	}
15066695Saguzovsk 	ASSERT(seg_phashsize_win != 0);
15076695Saguzovsk 
15086695Saguzovsk 	/*
15096695Saguzovsk 	 * If pcache is disabled (e.g. somebody is
15106695Saguzovsk 	 * already purging it) just return.
15116695Saguzovsk 	 */
15126695Saguzovsk 	if (seg_pdisabled) {
15136695Saguzovsk 		return;
15146695Saguzovsk 	}
15156695Saguzovsk 
15166695Saguzovsk 	cv_signal(&seg_pasync_cv);
15170Sstevel@tonic-gate }
15180Sstevel@tonic-gate 
15190Sstevel@tonic-gate /*
15200Sstevel@tonic-gate  * Run as a background thread and reclaim pagelock
15210Sstevel@tonic-gate  * pages which have not been used recently.
15220Sstevel@tonic-gate  */
15230Sstevel@tonic-gate void
15240Sstevel@tonic-gate seg_pasync_thread(void)
15250Sstevel@tonic-gate {
15260Sstevel@tonic-gate 	callb_cpr_t cpr_info;
15270Sstevel@tonic-gate 
15286695Saguzovsk 	if (seg_phashsize_win == 0) {
15296695Saguzovsk 		thread_exit();
15306695Saguzovsk 		/*NOTREACHED*/
15310Sstevel@tonic-gate 	}
15320Sstevel@tonic-gate 
15336695Saguzovsk 	seg_pasync_thr = curthread;
15346695Saguzovsk 
15356695Saguzovsk 	CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
15366695Saguzovsk 	    callb_generic_cpr, "seg_pasync");
15376695Saguzovsk 
15386695Saguzovsk 	if (segpcache_reap_ticks <= 0) {
15396695Saguzovsk 		segpcache_reap_ticks = segpcache_reap_sec * hz;
15406695Saguzovsk 	}
15410Sstevel@tonic-gate 
15426695Saguzovsk 	mutex_enter(&seg_pasync_mtx);
15436695Saguzovsk 	for (;;) {
15446695Saguzovsk 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
1545*11066Srafael.vanoni@sun.com 		(void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1546*11066Srafael.vanoni@sun.com 		    segpcache_reap_ticks, TR_CLOCK_TICK);
15476695Saguzovsk 		CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
15486695Saguzovsk 		if (seg_pdisabled == 0) {
15496695Saguzovsk 			seg_ppurge_async(0);
15506695Saguzovsk 		}
15510Sstevel@tonic-gate 	}
15520Sstevel@tonic-gate }
15530Sstevel@tonic-gate 
15540Sstevel@tonic-gate static struct kmem_cache *seg_cache;
15550Sstevel@tonic-gate 
15560Sstevel@tonic-gate /*
15570Sstevel@tonic-gate  * Initialize segment management data structures.
15580Sstevel@tonic-gate  */
15590Sstevel@tonic-gate void
15600Sstevel@tonic-gate seg_init(void)
15610Sstevel@tonic-gate {
15620Sstevel@tonic-gate 	kstat_t *ksp;
15630Sstevel@tonic-gate 
15646695Saguzovsk 	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
15656695Saguzovsk 	    0, NULL, NULL, NULL, NULL, NULL, 0);
15660Sstevel@tonic-gate 
15670Sstevel@tonic-gate 	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
15685928Sjj204856 	    segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
15690Sstevel@tonic-gate 	if (ksp) {
15700Sstevel@tonic-gate 		ksp->ks_data = (void *)segadvstat_ptr;
15710Sstevel@tonic-gate 		kstat_install(ksp);
15720Sstevel@tonic-gate 	}
15730Sstevel@tonic-gate 
15740Sstevel@tonic-gate 	seg_pinit();
15750Sstevel@tonic-gate }
15760Sstevel@tonic-gate 
15770Sstevel@tonic-gate /*
15780Sstevel@tonic-gate  * Allocate a segment to cover [base, base+size]
15790Sstevel@tonic-gate  * and attach it to the specified address space.
15800Sstevel@tonic-gate  */
15810Sstevel@tonic-gate struct seg *
15820Sstevel@tonic-gate seg_alloc(struct as *as, caddr_t base, size_t size)
15830Sstevel@tonic-gate {
15840Sstevel@tonic-gate 	struct seg *new;
15850Sstevel@tonic-gate 	caddr_t segbase;
15860Sstevel@tonic-gate 	size_t segsize;
15870Sstevel@tonic-gate 
15880Sstevel@tonic-gate 	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
15890Sstevel@tonic-gate 	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
15900Sstevel@tonic-gate 	    (uintptr_t)segbase;
15910Sstevel@tonic-gate 
15920Sstevel@tonic-gate 	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
15930Sstevel@tonic-gate 		return ((struct seg *)NULL);	/* bad virtual addr range */
15940Sstevel@tonic-gate 
15950Sstevel@tonic-gate 	if (as != &kas &&
15960Sstevel@tonic-gate 	    valid_usr_range(segbase, segsize, 0, as,
15970Sstevel@tonic-gate 	    as->a_userlimit) != RANGE_OKAY)
15980Sstevel@tonic-gate 		return ((struct seg *)NULL);	/* bad virtual addr range */
15990Sstevel@tonic-gate 
16000Sstevel@tonic-gate 	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
16010Sstevel@tonic-gate 	new->s_ops = NULL;
16020Sstevel@tonic-gate 	new->s_data = NULL;
16030Sstevel@tonic-gate 	new->s_szc = 0;
16040Sstevel@tonic-gate 	new->s_flags = 0;
16056695Saguzovsk 	mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
16066695Saguzovsk 	new->s_phead.p_lnext = &new->s_phead;
16076695Saguzovsk 	new->s_phead.p_lprev = &new->s_phead;
16080Sstevel@tonic-gate 	if (seg_attach(as, segbase, segsize, new) < 0) {
16090Sstevel@tonic-gate 		kmem_cache_free(seg_cache, new);
16100Sstevel@tonic-gate 		return ((struct seg *)NULL);
16110Sstevel@tonic-gate 	}
16120Sstevel@tonic-gate 	/* caller must fill in ops, data */
16130Sstevel@tonic-gate 	return (new);
16140Sstevel@tonic-gate }
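/*
 * Illustrative sketch of the intended calling pattern (crfp and argsp are
 * placeholders for a segment driver's create routine and its argument, not
 * names from this file): the caller allocates the segment, lets the driver
 * fill in s_ops and s_data, and frees the half-built segment on failure.
 *
 *	struct seg *seg;
 *
 *	if ((seg = seg_alloc(as, addr, len)) == NULL)
 *		return (ENOMEM);
 *	if ((error = (*crfp)(seg, argsp)) != 0)
 *		seg_free(seg);
 */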
16150Sstevel@tonic-gate 
16160Sstevel@tonic-gate /*
16170Sstevel@tonic-gate  * Attach a segment to the address space.  Used by seg_alloc()
16180Sstevel@tonic-gate  * and for kernel startup to attach to static segments.
16190Sstevel@tonic-gate  */
16200Sstevel@tonic-gate int
16210Sstevel@tonic-gate seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
16220Sstevel@tonic-gate {
16230Sstevel@tonic-gate 	seg->s_as = as;
16240Sstevel@tonic-gate 	seg->s_base = base;
16250Sstevel@tonic-gate 	seg->s_size = size;
16260Sstevel@tonic-gate 
16270Sstevel@tonic-gate 	/*
16280Sstevel@tonic-gate 	 * as_addseg() will add the segment at the appropriate point
16290Sstevel@tonic-gate 	 * in the list. It will return -1 if there is overlap with
16300Sstevel@tonic-gate 	 * an already existing segment.
16310Sstevel@tonic-gate 	 */
16320Sstevel@tonic-gate 	return (as_addseg(as, seg));
16330Sstevel@tonic-gate }
16340Sstevel@tonic-gate 
16350Sstevel@tonic-gate /*
16360Sstevel@tonic-gate  * Unmap a segment and free it from its associated address space.
16370Sstevel@tonic-gate  * This should be called by anybody who's finished with a whole segment's
16380Sstevel@tonic-gate  * mapping.  Just calls SEGOP_UNMAP() on the whole mapping.  It is the
16390Sstevel@tonic-gate  * responsibility of the segment driver to unlink the segment
16400Sstevel@tonic-gate  * from the address space, and to free public and private data structures
16410Sstevel@tonic-gate  * associated with the segment.  (This is typically done by a call to
16420Sstevel@tonic-gate  * seg_free()).
16430Sstevel@tonic-gate  */
16440Sstevel@tonic-gate void
16450Sstevel@tonic-gate seg_unmap(struct seg *seg)
16460Sstevel@tonic-gate {
16470Sstevel@tonic-gate #ifdef DEBUG
16480Sstevel@tonic-gate 	int ret;
16490Sstevel@tonic-gate #endif /* DEBUG */
16500Sstevel@tonic-gate 
16510Sstevel@tonic-gate 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
16520Sstevel@tonic-gate 
16530Sstevel@tonic-gate 	/* Shouldn't have called seg_unmap if mapping isn't yet established */
16540Sstevel@tonic-gate 	ASSERT(seg->s_data != NULL);
16550Sstevel@tonic-gate 
16560Sstevel@tonic-gate 	/* Unmap the whole mapping */
16570Sstevel@tonic-gate #ifdef DEBUG
16580Sstevel@tonic-gate 	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
16590Sstevel@tonic-gate 	ASSERT(ret == 0);
16600Sstevel@tonic-gate #else
16610Sstevel@tonic-gate 	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
16620Sstevel@tonic-gate #endif /* DEBUG */
16630Sstevel@tonic-gate }
16640Sstevel@tonic-gate 
16650Sstevel@tonic-gate /*
16660Sstevel@tonic-gate  * Free the segment from its associated as. This should only be called
16670Sstevel@tonic-gate  * if a mapping to the segment has not yet been established (e.g., if
16680Sstevel@tonic-gate  * an error occurs in the middle of doing an as_map when the segment
16690Sstevel@tonic-gate  * has already been partially set up) or if it has already been deleted
16700Sstevel@tonic-gate  * (e.g., from a segment driver unmap routine if the unmap applies to the
16710Sstevel@tonic-gate  * entire segment). If the mapping is currently set up then seg_unmap() should
16720Sstevel@tonic-gate  * be called instead.
16730Sstevel@tonic-gate  */
16740Sstevel@tonic-gate void
16750Sstevel@tonic-gate seg_free(struct seg *seg)
16760Sstevel@tonic-gate {
16770Sstevel@tonic-gate 	register struct as *as = seg->s_as;
16780Sstevel@tonic-gate 	struct seg *tseg = as_removeseg(as, seg);
16790Sstevel@tonic-gate 
16800Sstevel@tonic-gate 	ASSERT(tseg == seg);
16810Sstevel@tonic-gate 
16820Sstevel@tonic-gate 	/*
16830Sstevel@tonic-gate 	 * If the segment private data field is NULL,
16840Sstevel@tonic-gate 	 * then segment driver is not attached yet.
16850Sstevel@tonic-gate 	 */
16860Sstevel@tonic-gate 	if (seg->s_data != NULL)
16870Sstevel@tonic-gate 		SEGOP_FREE(seg);
16880Sstevel@tonic-gate 
16896695Saguzovsk 	mutex_destroy(&seg->s_pmtx);
16906695Saguzovsk 	ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
16916695Saguzovsk 	ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
16920Sstevel@tonic-gate 	kmem_cache_free(seg_cache, seg);
16930Sstevel@tonic-gate }
16940Sstevel@tonic-gate 
16950Sstevel@tonic-gate /*ARGSUSED*/
16960Sstevel@tonic-gate static void
16970Sstevel@tonic-gate seg_p_mem_config_post_add(
16980Sstevel@tonic-gate 	void *arg,
16990Sstevel@tonic-gate 	pgcnt_t delta_pages)
17000Sstevel@tonic-gate {
17010Sstevel@tonic-gate 	/* Nothing to do. */
17020Sstevel@tonic-gate }
17030Sstevel@tonic-gate 
17043480Sjfrank void
17053480Sjfrank seg_p_enable(void)
17063480Sjfrank {
17076695Saguzovsk 	mutex_enter(&seg_pcache_mtx);
17086695Saguzovsk 	ASSERT(seg_pdisabled != 0);
17096695Saguzovsk 	seg_pdisabled--;
17106695Saguzovsk 	mutex_exit(&seg_pcache_mtx);
17113480Sjfrank }
17123480Sjfrank 
17133480Sjfrank /*
17143480Sjfrank  * seg_p_disable - disables seg_pcache, and then attempts to empty the
17153480Sjfrank  * cache.
17163480Sjfrank  * Returns SEGP_SUCCESS if the cache was successfully emptied, or
17173480Sjfrank  * SEGP_FAIL if the cache could not be emptied.
17183480Sjfrank  */
17193480Sjfrank int
17203480Sjfrank seg_p_disable(void)
17213480Sjfrank {
17223480Sjfrank 	pgcnt_t	old_plocked;
17233480Sjfrank 	int stall_count = 0;
17243480Sjfrank 
17256695Saguzovsk 	mutex_enter(&seg_pcache_mtx);
17266695Saguzovsk 	seg_pdisabled++;
17276695Saguzovsk 	ASSERT(seg_pdisabled != 0);
17286695Saguzovsk 	mutex_exit(&seg_pcache_mtx);
17293480Sjfrank 
17303480Sjfrank 	/*
17313480Sjfrank 	 * Attempt to empty the cache. Terminate if seg_plocked does not
17323480Sjfrank 	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
17333480Sjfrank 	 */
17343480Sjfrank 	while (seg_plocked != 0) {
17356695Saguzovsk 		ASSERT(seg_phashsize_win != 0);
17363480Sjfrank 		old_plocked = seg_plocked;
17376695Saguzovsk 		seg_ppurge_async(1);
17383480Sjfrank 		if (seg_plocked == old_plocked) {
17393480Sjfrank 			if (stall_count++ > SEGP_STALL_THRESHOLD) {
17403480Sjfrank 				return (SEGP_FAIL);
17413480Sjfrank 			}
17423480Sjfrank 		} else
17433480Sjfrank 			stall_count = 0;
17443480Sjfrank 		if (seg_plocked != 0)
17453480Sjfrank 			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
17463480Sjfrank 	}
17473480Sjfrank 	return (SEGP_SUCCESS);
17483480Sjfrank }
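/*
 * Illustrative pairing sketch (mirrors the DR callbacks below; the middle
 * step is a placeholder): work that must not race with new pagelock caching
 * is bracketed by a disable and an enable.
 *
 *	if (seg_p_disable() != SEGP_SUCCESS)
 *		(decide whether to proceed with a non-empty cache)
 *	(do the work that requires the pcache to stay disabled)
 *	seg_p_enable();
 */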
17493480Sjfrank 
17500Sstevel@tonic-gate /*
17510Sstevel@tonic-gate  * Attempt to purge seg_pcache.  May need to return before this has
17520Sstevel@tonic-gate  * completed to allow other pre_del callbacks to unlock pages. This is
17530Sstevel@tonic-gate  * ok because:
17546695Saguzovsk  *	1) The seg_pdisabled flag has been set so at least we won't
17550Sstevel@tonic-gate  *	cache any more locks and the locks we couldn't purge
17560Sstevel@tonic-gate  *	will not be held if they do get released by a subsequent
17570Sstevel@tonic-gate  *	pre-delete callback.
17580Sstevel@tonic-gate  *
17590Sstevel@tonic-gate  *	2) The rest of the memory delete thread processing does not
17600Sstevel@tonic-gate  *	depend on the changes made in this pre-delete callback. No
17610Sstevel@tonic-gate  *	panics will result, the worst that will happen is that the
17620Sstevel@tonic-gate  *	DR code will timeout and cancel the delete.
17630Sstevel@tonic-gate  */
17640Sstevel@tonic-gate /*ARGSUSED*/
17650Sstevel@tonic-gate static int
17660Sstevel@tonic-gate seg_p_mem_config_pre_del(
17670Sstevel@tonic-gate 	void *arg,
17680Sstevel@tonic-gate 	pgcnt_t delta_pages)
17690Sstevel@tonic-gate {
17706695Saguzovsk 	if (seg_phashsize_win == 0) {
17716695Saguzovsk 		return (0);
17726695Saguzovsk 	}
17733480Sjfrank 	if (seg_p_disable() != SEGP_SUCCESS)
17743480Sjfrank 		cmn_err(CE_NOTE,
17753480Sjfrank 		    "!Pre-delete couldn't purge pagelock cache - continuing");
17760Sstevel@tonic-gate 	return (0);
17770Sstevel@tonic-gate }
17780Sstevel@tonic-gate 
17790Sstevel@tonic-gate /*ARGSUSED*/
17800Sstevel@tonic-gate static void
17810Sstevel@tonic-gate seg_p_mem_config_post_del(
17820Sstevel@tonic-gate 	void *arg,
17830Sstevel@tonic-gate 	pgcnt_t delta_pages,
17840Sstevel@tonic-gate 	int cancelled)
17850Sstevel@tonic-gate {
17866695Saguzovsk 	if (seg_phashsize_win == 0) {
17876695Saguzovsk 		return;
17886695Saguzovsk 	}
17893480Sjfrank 	seg_p_enable();
17900Sstevel@tonic-gate }
17910Sstevel@tonic-gate 
17920Sstevel@tonic-gate static kphysm_setup_vector_t seg_p_mem_config_vec = {
17930Sstevel@tonic-gate 	KPHYSM_SETUP_VECTOR_VERSION,
17940Sstevel@tonic-gate 	seg_p_mem_config_post_add,
17950Sstevel@tonic-gate 	seg_p_mem_config_pre_del,
17960Sstevel@tonic-gate 	seg_p_mem_config_post_del,
17970Sstevel@tonic-gate };
17980Sstevel@tonic-gate 
17990Sstevel@tonic-gate static void
18000Sstevel@tonic-gate seg_pinit_mem_config(void)
18010Sstevel@tonic-gate {
18020Sstevel@tonic-gate 	int ret;
18030Sstevel@tonic-gate 
18040Sstevel@tonic-gate 	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
18050Sstevel@tonic-gate 	/*
18060Sstevel@tonic-gate 	 * Want to catch this in the debug kernel. At run time, if the
18070Sstevel@tonic-gate 	 * callbacks don't get run all will be OK as the disable just makes
18080Sstevel@tonic-gate 	 * it more likely that the pages can be collected.
18090Sstevel@tonic-gate 	 */
18100Sstevel@tonic-gate 	ASSERT(ret == 0);
18110Sstevel@tonic-gate }
18123247Sgjelinek 
18133247Sgjelinek /*
18143247Sgjelinek  * Verify that the segment is not a shared anonymous segment which reserves
18153247Sgjelinek  * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
18163247Sgjelinek  * from one zone to another if any segments are shared.  This is because the
18173247Sgjelinek  * last process to exit will credit the swap reservation.  This could lead
18183247Sgjelinek  * to the swap being reserved by one zone, and credited to another.
18193247Sgjelinek  */
18203247Sgjelinek boolean_t
18213247Sgjelinek seg_can_change_zones(struct seg *seg)
18223247Sgjelinek {
18233247Sgjelinek 	struct segvn_data *svd;
18243247Sgjelinek 
18253247Sgjelinek 	if (seg->s_ops == &segspt_shmops)
18263247Sgjelinek 		return (B_FALSE);
18273247Sgjelinek 
18283247Sgjelinek 	if (seg->s_ops == &segvn_ops) {
18293247Sgjelinek 		svd = (struct segvn_data *)seg->s_data;
18303247Sgjelinek 		if (svd->type == MAP_SHARED &&
18313247Sgjelinek 		    svd->amp != NULL &&
18323247Sgjelinek 		    svd->amp->swresv > 0)
18333247Sgjelinek 			return (B_FALSE);
18343247Sgjelinek 	}
18353247Sgjelinek 	return (B_TRUE);
18363247Sgjelinek }
18373247Sgjelinek 
18383247Sgjelinek /*
18393247Sgjelinek  * Return swap reserved by a segment backing a private mapping.
18403247Sgjelinek  */
18413247Sgjelinek size_t
18423247Sgjelinek seg_swresv(struct seg *seg)
18433247Sgjelinek {
18443247Sgjelinek 	struct segvn_data *svd;
18453247Sgjelinek 	size_t swap = 0;
18463247Sgjelinek 
18473247Sgjelinek 	if (seg->s_ops == &segvn_ops) {
18483247Sgjelinek 		svd = (struct segvn_data *)seg->s_data;
18493247Sgjelinek 		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
18503247Sgjelinek 			swap = svd->swresv;
18513247Sgjelinek 	}
18523247Sgjelinek 	return (swap);
18533247Sgjelinek }
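/*
 * Illustrative sketch (AS_SEGFIRST/AS_SEGNEXT come from vm/as.h and are an
 * assumption about the surrounding VM code, not defined here): a caller
 * accounting private swap for an entire address space could sum the
 * per-segment reservations while holding the address space lock.
 *
 *	size_t swresv = 0;
 *	struct seg *seg;
 *
 *	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
 *		swresv += seg_swresv(seg);
 */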