/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - segment management.
 */

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/fs/swapnode.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
#include <sys/mman.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/anon.h>

/*
 * kstats for segment advise
 */
segadvstat_t segadvstat = {
    { "MADV_FREE_hit", KSTAT_DATA_ULONG },
    { "MADV_FREE_miss", KSTAT_DATA_ULONG },
};

kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);

/*
 * entry in the segment page cache
 */
struct seg_pcache {
    struct seg_pcache *p_hnext;    /* list for hashed blocks */
    struct seg_pcache *p_hprev;
    pcache_link_t p_plink;         /* per segment/amp list */
    void *p_htag0;                 /* segment/amp pointer */
    caddr_t p_addr;                /* base address/anon_idx */
    size_t p_len;                  /* total bytes */
    size_t p_wlen;                 /* writable bytes at p_addr */
    struct page **p_pp;            /* pp shadow list */
    seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */
    clock_t p_lbolt;               /* lbolt from last use */
    struct seg_phash *p_hashp;     /* our pcache hash bucket */
    uint_t p_active;               /* active count */
    uchar_t p_write;               /* true if S_WRITE */
    uchar_t p_ref;                 /* reference byte */
    ushort_t p_flags;              /* bit flags */
};

struct seg_phash {
    struct seg_pcache *p_hnext;    /* list for hashed blocks */
    struct seg_pcache *p_hprev;
    kmutex_t p_hmutex;             /* protects hash bucket */
    pcache_link_t p_halink[2];     /* active bucket linkages */
};

struct seg_phash_wired {
    struct seg_pcache *p_hnext;    /* list for hashed blocks */
    struct seg_pcache *p_hprev;
    kmutex_t p_hmutex;             /* protects hash bucket */
};

/*
 * A parameter to control a maximum number of bytes that can be
 * purged from pcache at a time.
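 * This default is picked up by the segpcache_maxapurge_bytes tunable; like
 * the other knobs listed further down it can be overridden from /etc/system,
 * e.g. (an illustrative value, not a recommendation):
 *
 *     set segpcache_maxapurge_bytes = 0x10000000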
 */
#define P_MAX_APURGE_BYTES    (1024 * 1024 * 1024)

/*
 * log2(fraction of pcache to reclaim at a time).
 */
#define P_SHRINK_SHFT        (5)

/*
 * The following variables can be tuned via /etc/system.
 */

int segpcache_enabled = 1;          /* if 1, shadow lists are cached */
pgcnt_t segpcache_maxwindow = 0;    /* max # of pages that can be cached */
ulong_t segpcache_hashsize_win = 0;   /* # of non wired buckets */
ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */
int segpcache_reap_sec = 1;         /* reap check rate in secs */
clock_t segpcache_reap_ticks = 0;   /* reap interval in ticks */
int segpcache_pcp_maxage_sec = 1;   /* pcp max age in secs */
clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */

static kmutex_t seg_pcache_mtx;    /* protects seg_pdisabled counter */
static kmutex_t seg_pasync_mtx;    /* protects async thread scheduling */
static kcondvar_t seg_pasync_cv;

#pragma align 64(pctrl1)
#pragma align 64(pctrl2)
#pragma align 64(pctrl3)

/*
 * Keep frequently used variables together in one cache line.
 */
static struct p_ctrl1 {
    uint_t p_disabled;             /* if not 0, caching temporarily off */
    pgcnt_t p_maxwin;              /* max # of pages that can be cached */
    size_t p_hashwin_sz;           /* # of non wired buckets */
    struct seg_phash *p_htabwin;   /* hash table for non wired entries */
    size_t p_hashwired_sz;         /* # of wired buckets */
    struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
    kmem_cache_t *p_kmcache;       /* kmem cache for seg_pcache structs */
#ifdef _LP64
    ulong_t pad[1];
#endif /* _LP64 */
} pctrl1;

static struct p_ctrl2 {
    kmutex_t p_mem_mtx;    /* protects window counter and p_halinks */
    pgcnt_t p_locked_win;  /* # pages from window */
    pgcnt_t p_locked;      /* # of pages cached by pagelock */
    uchar_t p_ahcur;       /* current active links for insert/delete */
    uchar_t p_athr_on;     /* async reclaim thread is running. */
    pcache_link_t p_ahhead[2];    /* active buckets linkages */
} pctrl2;

static struct p_ctrl3 {
    clock_t p_pcp_maxage;          /* max pcp age in ticks */
    ulong_t p_athr_empty_ahb;      /* athread walk stats */
    ulong_t p_athr_full_ahb;       /* athread walk stats */
    pgcnt_t p_maxapurge_npages;    /* max pages to purge at a time */
    int p_shrink_shft;             /* reap shift factor */
#ifdef _LP64
    ulong_t pad[3];
#endif /* _LP64 */
} pctrl3;

#define seg_pdisabled          pctrl1.p_disabled
#define seg_pmaxwindow         pctrl1.p_maxwin
#define seg_phashsize_win      pctrl1.p_hashwin_sz
#define seg_phashtab_win       pctrl1.p_htabwin
#define seg_phashsize_wired    pctrl1.p_hashwired_sz
#define seg_phashtab_wired     pctrl1.p_htabwired
#define seg_pkmcache           pctrl1.p_kmcache
#define seg_pmem_mtx           pctrl2.p_mem_mtx
#define seg_plocked_window     pctrl2.p_locked_win
#define seg_plocked            pctrl2.p_locked
#define seg_pahcur             pctrl2.p_ahcur
#define seg_pathr_on           pctrl2.p_athr_on
#define seg_pahhead            pctrl2.p_ahhead
#define seg_pmax_pcpage        pctrl3.p_pcp_maxage
#define seg_pathr_empty_ahb    pctrl3.p_athr_empty_ahb
#define seg_pathr_full_ahb     pctrl3.p_athr_full_ahb
#define seg_pshrink_shift      pctrl3.p_shrink_shft
#define seg_pmaxapurge_npages  pctrl3.p_maxapurge_npages

#define P_HASHWIN_MASK      (seg_phashsize_win - 1)
#define P_HASHWIRED_MASK    (seg_phashsize_wired - 1)
#define P_BASESHIFT         (6)

kthread_t *seg_pasync_thr;

extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

#define IS_PFLAGS_WIRED(flags)    ((flags) & SEGP_FORCE_WIRED)
#define IS_PCP_WIRED(pcp)         IS_PFLAGS_WIRED((pcp)->p_flags)

#define LBOLT_DELTA(t)    ((ulong_t)(ddi_get_lbolt() - (t)))

#define PCP_AGE(pcp)      LBOLT_DELTA((pcp)->p_lbolt)

/*
 * htag0 argument can be a seg or amp pointer.
 */
#define P_HASHBP(seg, htag0, addr, flags)                            \
    (IS_PFLAGS_WIRED((flags)) ?                                      \
        ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &  \
        ((uintptr_t)(htag0) >> P_BASESHIFT)]) :                      \
        (&seg_phashtab_win[P_HASHWIN_MASK &                          \
        (((uintptr_t)(htag0) >> 3) ^                                 \
        ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?               \
        (flags >> 16) : page_get_shift((seg)->s_szc))))]))

/*
 * htag0 argument can be a seg or amp pointer.
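 * P_MATCH() deliberately allows the cached shadow list to be longer than the
 * request: an entry hits when the tag and start address are identical and
 * p_len covers the requested len. For example, a cached [addr, addr + 8K)
 * list can satisfy a later [addr, addr + 4K) lookup for the same tag.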
 */
#define P_MATCH(pcp, htag0, addr, len)                               \
    ((pcp)->p_htag0 == (htag0) &&                                    \
    (pcp)->p_addr == (addr) &&                                       \
    (pcp)->p_len >= (len))

#define P_MATCH_PP(pcp, htag0, addr, len, pp)                        \
    ((pcp)->p_pp == (pp) &&                                          \
    (pcp)->p_htag0 == (htag0) &&                                     \
    (pcp)->p_addr == (addr) &&                                       \
    (pcp)->p_len >= (len))

#define plink2pcache(pl)    ((struct seg_pcache *)((uintptr_t)(pl) - \
    offsetof(struct seg_pcache, p_plink)))

#define hlink2phash(hl, l)    ((struct seg_phash *)((uintptr_t)(hl) - \
    offsetof(struct seg_phash, p_halink[l])))

/*
 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
 * active hash bucket lists. We maintain active bucket lists to reduce the
 * overhead of finding active buckets during asynchronous purging since there
 * can be 10s of millions of buckets on a large system but only a small subset
 * of them in actual use.
 *
 * There are 2 active bucket lists. The current active list (as per
 * seg_pahcur) is used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add
 * and delete buckets. The other list is used by the asynchronous purge
 * thread. This allows the purge thread to walk its active list without
 * holding seg_pmem_mtx for a long time. When the asynchronous thread is done
 * with its list it switches to the current active list and makes the list it
 * just finished processing the current active list.
 *
 * seg_padd_abuck() only adds the bucket to the current list if the bucket is
 * not yet on any list. seg_premove_abuck() may remove the bucket from either
 * list. If the bucket is on the current list it will always be removed.
 * Otherwise the bucket is only removed if the asynchronous purge thread is
 * not currently running or seg_premove_abuck() is called by the asynchronous
 * purge thread itself. A given bucket can only be on one of the active lists
 * at a time. These routines should be called with the per bucket lock held.
 * The routines use seg_pmem_mtx to protect list updates. seg_padd_abuck()
 * must be called after the first entry is added to the bucket chain and
 * seg_premove_abuck() must be called after the last pcp entry is deleted from
 * its chain. The per bucket lock held by the callers avoids a potential race
 * condition where seg_premove_abuck() removes a bucket after pcp entries are
 * added to its list after the caller checked that the bucket has no entries
 * (this race would cause a loss of an active bucket from the active lists).
 *
 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
 * New entries are added to the end of the list since LRU is used as the
 * purging policy.
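 *
 * As an illustration: while seg_pahcur is 0, seg_pinsert()/seg_pinactive()/
 * seg_ppurge() link and unlink buckets on list 0 while the asynchronous
 * thread drains list 1; once the thread has processed its list it sets
 * seg_pahcur to 1, so new activity moves to list 1 and list 0 becomes the
 * list it will drain on its next pass.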
 */
static void
seg_padd_abuck(struct seg_phash *hp)
{
    int lix;

    ASSERT(MUTEX_HELD(&hp->p_hmutex));
    ASSERT((struct seg_phash *)hp->p_hnext != hp);
    ASSERT((struct seg_phash *)hp->p_hprev != hp);
    ASSERT(hp->p_hnext == hp->p_hprev);
    ASSERT(!IS_PCP_WIRED(hp->p_hnext));
    ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
    ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
    ASSERT(hp >= seg_phashtab_win &&
        hp < &seg_phashtab_win[seg_phashsize_win]);

    /*
     * This bucket can already be on one of active lists
     * since seg_premove_abuck() may have failed to remove it
     * before.
     */
    mutex_enter(&seg_pmem_mtx);
    lix = seg_pahcur;
    ASSERT(lix >= 0 && lix <= 1);
    if (hp->p_halink[lix].p_lnext != NULL) {
        ASSERT(hp->p_halink[lix].p_lprev != NULL);
        ASSERT(hp->p_halink[!lix].p_lnext == NULL);
        ASSERT(hp->p_halink[!lix].p_lprev == NULL);
        mutex_exit(&seg_pmem_mtx);
        return;
    }
    ASSERT(hp->p_halink[lix].p_lprev == NULL);

    /*
     * If this bucket is still on list !lix async thread can't yet remove
     * it since we hold here per bucket lock. In this case just return
     * since async thread will eventually find and process this bucket.
     */
    if (hp->p_halink[!lix].p_lnext != NULL) {
        ASSERT(hp->p_halink[!lix].p_lprev != NULL);
        mutex_exit(&seg_pmem_mtx);
        return;
    }
    ASSERT(hp->p_halink[!lix].p_lprev == NULL);
    /*
     * This bucket is not on any active bucket list yet.
     * Add the bucket to the tail of current active list.
     */
    hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
    hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
    seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
    seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
    mutex_exit(&seg_pmem_mtx);
}

static void
seg_premove_abuck(struct seg_phash *hp, int athr)
{
    int lix;

    ASSERT(MUTEX_HELD(&hp->p_hmutex));
    ASSERT((struct seg_phash *)hp->p_hnext == hp);
    ASSERT((struct seg_phash *)hp->p_hprev == hp);
    ASSERT(hp >= seg_phashtab_win &&
        hp < &seg_phashtab_win[seg_phashsize_win]);

    if (athr) {
        ASSERT(seg_pathr_on);
        ASSERT(seg_pahcur <= 1);
        /*
         * We are called by asynchronous thread that found this bucket
         * on not currently active (i.e. !seg_pahcur) list. Remove it
         * from there. Per bucket lock we are holding makes sure
         * seg_pinsert() can't sneak in and add pcp entries to this
         * bucket right before we remove the bucket from its list.
         */
        lix = !seg_pahcur;
        ASSERT(hp->p_halink[lix].p_lnext != NULL);
        ASSERT(hp->p_halink[lix].p_lprev != NULL);
        ASSERT(hp->p_halink[!lix].p_lnext == NULL);
        ASSERT(hp->p_halink[!lix].p_lprev == NULL);
        hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
        hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
        hp->p_halink[lix].p_lnext = NULL;
        hp->p_halink[lix].p_lprev = NULL;
        return;
    }

    mutex_enter(&seg_pmem_mtx);
    lix = seg_pahcur;
    ASSERT(lix >= 0 && lix <= 1);

    /*
     * If the bucket is on currently active list just remove it from
     * there.
     */
    if (hp->p_halink[lix].p_lnext != NULL) {
        ASSERT(hp->p_halink[lix].p_lprev != NULL);
        ASSERT(hp->p_halink[!lix].p_lnext == NULL);
        ASSERT(hp->p_halink[!lix].p_lprev == NULL);
        hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
        hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
        hp->p_halink[lix].p_lnext = NULL;
        hp->p_halink[lix].p_lprev = NULL;
        mutex_exit(&seg_pmem_mtx);
        return;
    }
    ASSERT(hp->p_halink[lix].p_lprev == NULL);

    /*
     * If asynchronous thread is not running we can remove the bucket from
     * not currently active list. The bucket must be on this list since we
     * already checked that it's not on the other list and the bucket from
     * which we just deleted the last pcp entry must be still on one of the
     * active bucket lists.
     */
    lix = !lix;
    ASSERT(hp->p_halink[lix].p_lnext != NULL);
    ASSERT(hp->p_halink[lix].p_lprev != NULL);

    if (!seg_pathr_on) {
        hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
        hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
        hp->p_halink[lix].p_lnext = NULL;
        hp->p_halink[lix].p_lprev = NULL;
    }
    mutex_exit(&seg_pmem_mtx);
}

/*
 * Check if bucket pointed by hp already has a pcp entry that matches request
 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
 * Also delete matching entries that cover smaller address range but start
 * at the same address as addr argument. Return the list of deleted entries if
 * any. This is an internal helper function called from seg_pinsert() only
 * for non wired shadow lists. The caller already holds a per seg/amp list
 * lock.
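 * The deleted entries are returned chained through their p_hprev pointers;
 * seg_pinsert() runs each entry's reclaim callback and frees it only after
 * the hash bucket and per seg/amp locks have been dropped.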
 */
static struct seg_pcache *
seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
    caddr_t addr, size_t len, int *found)
{
    struct seg_pcache *pcp;
    struct seg_pcache *delcallb_list = NULL;

    ASSERT(MUTEX_HELD(&hp->p_hmutex));

    *found = 0;
    for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
        pcp = pcp->p_hnext) {
        ASSERT(pcp->p_hashp == hp);
        if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
            ASSERT(!IS_PCP_WIRED(pcp));
            if (pcp->p_len < len) {
                pcache_link_t *plinkp;
                if (pcp->p_active) {
                    continue;
                }
                plinkp = &pcp->p_plink;
                plinkp->p_lprev->p_lnext = plinkp->p_lnext;
                plinkp->p_lnext->p_lprev = plinkp->p_lprev;
                pcp->p_hprev->p_hnext = pcp->p_hnext;
                pcp->p_hnext->p_hprev = pcp->p_hprev;
                pcp->p_hprev = delcallb_list;
                delcallb_list = pcp;
            } else {
                *found = 1;
                break;
            }
        }
    }
    return (delcallb_list);
}

/*
 * lookup an address range in pagelock cache. Return shadow list and bump up
 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
 * as a lookup tag.
 */
struct page **
seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    enum seg_rw rw, uint_t flags)
{
    struct seg_pcache *pcp;
    struct seg_phash *hp;
    void *htag0;

    ASSERT(seg != NULL);
    ASSERT(rw == S_READ || rw == S_WRITE);

    /*
     * Skip pagelock cache, while DR is in progress or
     * seg_pcache is off.
     */
    if (seg_pdisabled) {
        return (NULL);
    }
    ASSERT(seg_phashsize_win != 0);

    htag0 = (amp == NULL ? (void *)seg : (void *)amp);
    hp = P_HASHBP(seg, htag0, addr, flags);
    mutex_enter(&hp->p_hmutex);
    for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
        pcp = pcp->p_hnext) {
        ASSERT(pcp->p_hashp == hp);
        if (P_MATCH(pcp, htag0, addr, len)) {
            ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
            /*
             * If this request wants to write pages
             * but write permissions starting from
             * addr don't cover the entire length len
             * return lookup failure back to the caller.
             * It will check protections and fail this
             * pagelock operation with EACCES error.
             */
            if (rw == S_WRITE && pcp->p_wlen < len) {
                break;
            }
            if (pcp->p_active == UINT_MAX) {
                break;
            }
            pcp->p_active++;
            if (rw == S_WRITE && !pcp->p_write) {
                pcp->p_write = 1;
            }
            mutex_exit(&hp->p_hmutex);
            return (pcp->p_pp);
        }
    }
    mutex_exit(&hp->p_hmutex);
    return (NULL);
}

/*
 * mark address range inactive. If the cache is off or the address range is
 * not in the cache or another shadow list that covers bigger range is found
 * we call the segment driver to reclaim the pages. Otherwise just decrement
 * active count and set ref bit. If amp is not NULL use amp as a lookup tag
 * otherwise use seg as a lookup tag.
 */
void
seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
    seg_preclaim_cbfunc_t callback)
{
    struct seg_pcache *pcp;
    struct seg_phash *hp;
    kmutex_t *pmtx = NULL;
    pcache_link_t *pheadp;
    void *htag0;
    pgcnt_t npages = 0;
    int keep = 0;

    ASSERT(seg != NULL);
    ASSERT(rw == S_READ || rw == S_WRITE);

    htag0 = (amp == NULL ? (void *)seg : (void *)amp);

    /*
     * Skip lookup if pcache is not configured.
     */
    if (seg_phashsize_win == 0) {
        goto out;
    }

    /*
     * Grab per seg/amp lock before hash lock if we are going to remove
     * inactive entry from pcache.
     */
    if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
        if (amp == NULL) {
            pheadp = &seg->s_phead;
            pmtx = &seg->s_pmtx;
        } else {
            pheadp = &amp->a_phead;
            pmtx = &amp->a_pmtx;
        }
        mutex_enter(pmtx);
    }

    hp = P_HASHBP(seg, htag0, addr, flags);
    mutex_enter(&hp->p_hmutex);
again:
    for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
        pcp = pcp->p_hnext) {
        ASSERT(pcp->p_hashp == hp);
        if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
            ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
            ASSERT(pcp->p_active);
            if (keep) {
                /*
                 * Don't remove this pcp entry
                 * if we didn't find duplicate
                 * shadow lists on second search.
                 * Somebody removed those duplicates
                 * since we dropped hash lock after first
                 * search.
                 */
                ASSERT(pmtx != NULL);
                ASSERT(!IS_PFLAGS_WIRED(flags));
                mutex_exit(pmtx);
                pmtx = NULL;
            }
            pcp->p_active--;
            if (pcp->p_active == 0 && (pmtx != NULL ||
                (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {

                /*
                 * This entry is no longer active. Remove it
                 * now either because pcaching is temporarily
                 * disabled or there're other pcp entries that
                 * can match this pagelock request (i.e. this
                 * entry is a duplicate).
                 */

                ASSERT(callback == pcp->p_callback);
                if (pmtx != NULL) {
                    pcache_link_t *plinkp = &pcp->p_plink;
                    ASSERT(!IS_PCP_WIRED(pcp));
                    ASSERT(pheadp->p_lnext != pheadp);
                    ASSERT(pheadp->p_lprev != pheadp);
                    plinkp->p_lprev->p_lnext = plinkp->p_lnext;
                    plinkp->p_lnext->p_lprev = plinkp->p_lprev;
                }
                pcp->p_hprev->p_hnext = pcp->p_hnext;
                pcp->p_hnext->p_hprev = pcp->p_hprev;
                if (!IS_PCP_WIRED(pcp) &&
                    hp->p_hnext == (struct seg_pcache *)hp) {
                    /*
                     * We removed the last entry from this
                     * bucket. Now remove the bucket from
                     * its active list.
                     */
                    seg_premove_abuck(hp, 0);
                }
                mutex_exit(&hp->p_hmutex);
                if (pmtx != NULL) {
                    mutex_exit(pmtx);
                }
                len = pcp->p_len;
                npages = btop(len);
                if (rw != S_WRITE && pcp->p_write) {
                    rw = S_WRITE;
                }
                kmem_cache_free(seg_pkmcache, pcp);
                goto out;
            } else {
                /*
                 * We found a matching pcp entry but will not
                 * free it right away even if it's no longer
                 * active.
                 */
                if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
                    /*
                     * Set the reference bit and mark the
                     * time of last access to this pcp
                     * so that asynchronous thread doesn't
                     * free it immediately since
                     * it may be reactivated very soon.
                     */
                    pcp->p_lbolt = ddi_get_lbolt();
                    pcp->p_ref = 1;
                }
                mutex_exit(&hp->p_hmutex);
                if (pmtx != NULL) {
                    mutex_exit(pmtx);
                }
                return;
            }
        } else if (!IS_PFLAGS_WIRED(flags) &&
            P_MATCH(pcp, htag0, addr, len)) {
            /*
             * This is a duplicate pcp entry. This situation may
             * happen if a bigger shadow list that covers our
             * range was added while our entry was still active.
             * Now we can free our pcp entry if it becomes
             * inactive.
             */
            if (!pcp->p_active) {
                /*
                 * Mark this entry as referenced just in case
                 * we'll free our own pcp entry soon.
                 */
                pcp->p_lbolt = ddi_get_lbolt();
                pcp->p_ref = 1;
            }
            if (pmtx != NULL) {
                /*
                 * we are already holding pmtx and found a
                 * duplicate. Don't keep our own pcp entry.
                 */
                keep = 0;
                continue;
            }
            /*
             * We have to use mutex_tryenter to attempt to lock
             * seg/amp list lock since we already hold hash lock
             * and seg/amp list lock is above hash lock in lock
             * order. If mutex_tryenter fails drop hash lock and
             * retake both locks in correct order and re-search
             * this hash chain.
             */
            ASSERT(keep == 0);
            if (amp == NULL) {
                pheadp = &seg->s_phead;
                pmtx = &seg->s_pmtx;
            } else {
                pheadp = &amp->a_phead;
                pmtx = &amp->a_pmtx;
            }
            if (!mutex_tryenter(pmtx)) {
                mutex_exit(&hp->p_hmutex);
                mutex_enter(pmtx);
                mutex_enter(&hp->p_hmutex);
                /*
                 * If we don't find bigger shadow list on
                 * second search (it may happen since we
                 * dropped bucket lock) keep the entry that
                 * matches our own shadow list.
                 */
                keep = 1;
                goto again;
            }
        }
    }
    mutex_exit(&hp->p_hmutex);
    if (pmtx != NULL) {
        mutex_exit(pmtx);
    }
out:
    (*callback)(htag0, addr, len, pp, rw, 0);
    if (npages) {
        mutex_enter(&seg_pmem_mtx);
        ASSERT(seg_plocked >= npages);
        seg_plocked -= npages;
        if (!IS_PFLAGS_WIRED(flags)) {
            ASSERT(seg_plocked_window >= npages);
            seg_plocked_window -= npages;
        }
        mutex_exit(&seg_pmem_mtx);
    }
}

#ifdef DEBUG
static uint32_t p_insert_chk_mtbf = 0;
#endif

/*
 * The seg_pinsert_check() is used by segment drivers to predict whether
 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
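 *
 * A typical segment-driver caller is expected to look roughly like the
 * sketch below (illustrative only; pplist, wlen, rw and callback stand in
 * for the driver's own state):
 *
 *     if (seg_pinsert_check(seg, amp, addr, len, flags) == SEGP_SUCCESS) {
 *         pplist = ... build the shadow list ...;
 *         if (seg_pinsert(seg, amp, addr, len, wlen, pplist, rw,
 *             flags, callback) == SEGP_FAIL)
 *             ... undo and fall back to the uncached path ...;
 *     }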
 */
/*ARGSUSED*/
int
seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, uint_t flags)
{
    ASSERT(seg != NULL);

#ifdef DEBUG
    if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
        return (SEGP_FAIL);
    }
#endif

    if (seg_pdisabled) {
        return (SEGP_FAIL);
    }
    ASSERT(seg_phashsize_win != 0);

    if (IS_PFLAGS_WIRED(flags)) {
        return (SEGP_SUCCESS);
    }

    if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
        return (SEGP_FAIL);
    }

    if (freemem < desfree) {
        return (SEGP_FAIL);
    }

    return (SEGP_SUCCESS);
}

#ifdef DEBUG
static uint32_t p_insert_mtbf = 0;
#endif

/*
 * Insert address range with shadow list into pagelock cache if there's no
 * shadow list already cached for this address range. If the cache is off or
 * caching is temporarily disabled or the allowed 'window' is exceeded return
 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
 *
 * For non wired shadow lists (segvn case) include address in the hashing
 * function to avoid linking all the entries from the same segment or amp on
 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
 * pcache entries are also linked on a per segment/amp list so that all
 * entries can be found quickly during seg/amp purge without walking the
 * entire pcache hash table. For wired shadow lists (segspt case) we
 * don't use address hashing and per segment linking because the caller
 * currently inserts only one entry per segment that covers the entire
 * segment. If we used per segment linking even for segspt it would complicate
 * seg_ppurge_wiredpp() locking.
 *
 * Both hash bucket and per seg/amp locks need to be held before adding a non
 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
 * first.
 *
 * This function will also remove from pcache old inactive shadow lists that
 * overlap with this request but cover smaller range for the same start
 * address.
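 *
 * wlen describes how many bytes starting at addr are mapped writable in the
 * cached shadow list: the ASSERTs below require wlen == len for S_WRITE
 * requests, while an S_READ request may cache a list whose writable prefix
 * (wlen) is shorter than len. seg_plookup() later fails S_WRITE lookups
 * against such an entry.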
 */
int
seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
    seg_preclaim_cbfunc_t callback)
{
    struct seg_pcache *pcp;
    struct seg_phash *hp;
    pgcnt_t npages;
    pcache_link_t *pheadp;
    kmutex_t *pmtx;
    struct seg_pcache *delcallb_list = NULL;

    ASSERT(seg != NULL);
    ASSERT(rw == S_READ || rw == S_WRITE);
    ASSERT(rw == S_READ || wlen == len);
    ASSERT(rw == S_WRITE || wlen <= len);
    ASSERT(amp == NULL || wlen == len);

#ifdef DEBUG
    if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
        return (SEGP_FAIL);
    }
#endif

    if (seg_pdisabled) {
        return (SEGP_FAIL);
    }
    ASSERT(seg_phashsize_win != 0);

    ASSERT((len & PAGEOFFSET) == 0);
    npages = btop(len);
    mutex_enter(&seg_pmem_mtx);
    if (!IS_PFLAGS_WIRED(flags)) {
        if (seg_plocked_window + npages > seg_pmaxwindow) {
            mutex_exit(&seg_pmem_mtx);
            return (SEGP_FAIL);
        }
        seg_plocked_window += npages;
    }
    seg_plocked += npages;
    mutex_exit(&seg_pmem_mtx);

    pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
    /*
     * If amp is not NULL set htag0 to amp otherwise set it to seg.
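     * The SEGP_AMP bit recorded in p_flags for the amp case is what later
     * lets the asynchronous purge thread cast p_htag0 back to the right
     * type when it needs the per amp (vs. per segment) list head and mutex.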
     */
    if (amp == NULL) {
        pcp->p_htag0 = (void *)seg;
        pcp->p_flags = flags & 0xffff;
    } else {
        pcp->p_htag0 = (void *)amp;
        pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
    }
    pcp->p_addr = addr;
    pcp->p_len = len;
    pcp->p_wlen = wlen;
    pcp->p_pp = pp;
    pcp->p_write = (rw == S_WRITE);
    pcp->p_callback = callback;
    pcp->p_active = 1;

    hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
    if (!IS_PFLAGS_WIRED(flags)) {
        int found;
        void *htag0;
        if (amp == NULL) {
            pheadp = &seg->s_phead;
            pmtx = &seg->s_pmtx;
            htag0 = (void *)seg;
        } else {
            pheadp = &amp->a_phead;
            pmtx = &amp->a_pmtx;
            htag0 = (void *)amp;
        }
        mutex_enter(pmtx);
        mutex_enter(&hp->p_hmutex);
        delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
            len, &found);
        if (found) {
            mutex_exit(&hp->p_hmutex);
            mutex_exit(pmtx);
            mutex_enter(&seg_pmem_mtx);
            seg_plocked -= npages;
            seg_plocked_window -= npages;
            mutex_exit(&seg_pmem_mtx);
            kmem_cache_free(seg_pkmcache, pcp);
            goto out;
        }
        pcp->p_plink.p_lnext = pheadp->p_lnext;
        pcp->p_plink.p_lprev = pheadp;
        pheadp->p_lnext->p_lprev = &pcp->p_plink;
        pheadp->p_lnext = &pcp->p_plink;
    } else {
        mutex_enter(&hp->p_hmutex);
    }
    pcp->p_hashp = hp;
    pcp->p_hnext = hp->p_hnext;
    pcp->p_hprev = (struct seg_pcache *)hp;
    hp->p_hnext->p_hprev = pcp;
    hp->p_hnext = pcp;
    if (!IS_PFLAGS_WIRED(flags) &&
        hp->p_hprev == pcp) {
        seg_padd_abuck(hp);
    }
    mutex_exit(&hp->p_hmutex);
    if (!IS_PFLAGS_WIRED(flags)) {
        mutex_exit(pmtx);
    }

out:
    npages = 0;
    while (delcallb_list != NULL) {
        pcp = delcallb_list;
        delcallb_list = pcp->p_hprev;
        ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
        (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
            pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
        npages += btop(pcp->p_len);
        kmem_cache_free(seg_pkmcache, pcp);
    }
    if (npages) {
        ASSERT(!IS_PFLAGS_WIRED(flags));
        mutex_enter(&seg_pmem_mtx);
        ASSERT(seg_plocked >= npages);
        ASSERT(seg_plocked_window >= npages);
        seg_plocked -= npages;
        seg_plocked_window -= npages;
        mutex_exit(&seg_pmem_mtx);
    }

    return (SEGP_SUCCESS);
}

/*
 * purge entries from the pagelock cache if not active
 * and not recently used.
 */
static void
seg_ppurge_async(int force)
{
    struct seg_pcache *delcallb_list = NULL;
    struct seg_pcache *pcp;
    struct seg_phash *hp;
    pgcnt_t npages = 0;
    pgcnt_t npages_window = 0;
    pgcnt_t npgs_to_purge;
    pgcnt_t npgs_purged = 0;
    int hlinks = 0;
    int hlix;
    pcache_link_t *hlinkp;
    pcache_link_t *hlnextp = NULL;
    int lowmem;
    int trim;

    ASSERT(seg_phashsize_win != 0);

    /*
     * if the cache is off or empty, return
     */
    if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
        return;
    }

    if (!force) {
        lowmem = 0;
        trim = 0;
        if (freemem < lotsfree + needfree) {
            spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
            if (fmem <= 5 * (desfree >> 2)) {
                lowmem = 1;
            } else if (fmem <= 7 * (lotsfree >> 3)) {
                if (seg_plocked_window >=
                    (availrmem_initial >> 1)) {
                    lowmem = 1;
                }
            } else if (fmem < lotsfree) {
                if (seg_plocked_window >=
                    3 * (availrmem_initial >> 2)) {
                    lowmem = 1;
                }
            }
        }
        if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
            trim = 1;
        }
        if (!lowmem && !trim) {
            return;
        }
        npgs_to_purge = seg_plocked_window >> seg_pshrink_shift;
        if (lowmem) {
            npgs_to_purge = MIN(npgs_to_purge,
                MAX(seg_pmaxapurge_npages, desfree));
        } else {
            npgs_to_purge = MIN(npgs_to_purge,
                seg_pmaxapurge_npages);
        }
        if (npgs_to_purge == 0) {
            return;
        }
    } else {
        struct seg_phash_wired *hpw;

        ASSERT(seg_phashsize_wired != 0);

        for (hpw = seg_phashtab_wired;
            hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {

            if (hpw->p_hnext == (struct seg_pcache *)hpw) {
                continue;
            }

            mutex_enter(&hpw->p_hmutex);

            for (pcp = hpw->p_hnext;
                pcp != (struct seg_pcache *)hpw;
                pcp = pcp->p_hnext) {

                ASSERT(IS_PCP_WIRED(pcp));
                ASSERT(pcp->p_hashp ==
                    (struct seg_phash *)hpw);

                if (pcp->p_active) {
                    continue;
                }
                pcp->p_hprev->p_hnext = pcp->p_hnext;
                pcp->p_hnext->p_hprev = pcp->p_hprev;
                pcp->p_hprev = delcallb_list;
                delcallb_list = pcp;
            }
            mutex_exit(&hpw->p_hmutex);
        }
    }

    mutex_enter(&seg_pmem_mtx);
    if (seg_pathr_on) {
        mutex_exit(&seg_pmem_mtx);
        goto runcb;
    }
    seg_pathr_on = 1;
    mutex_exit(&seg_pmem_mtx);
    ASSERT(seg_pahcur <= 1);
    hlix = !seg_pahcur;

again:
    for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
        hlinkp = hlnextp) {

        hlnextp = hlinkp->p_lnext;
        ASSERT(hlnextp != NULL);

        hp = hlink2phash(hlinkp, hlix);
        if (hp->p_hnext == (struct seg_pcache *)hp) {
            seg_pathr_empty_ahb++;
            continue;
        }
        seg_pathr_full_ahb++;
        mutex_enter(&hp->p_hmutex);

        for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
            pcp = pcp->p_hnext) {
            pcache_link_t *pheadp;
            pcache_link_t *plinkp;
            void *htag0;
            kmutex_t *pmtx;

            ASSERT(!IS_PCP_WIRED(pcp));
            ASSERT(pcp->p_hashp == hp);

            if (pcp->p_active) {
                continue;
            }
            if (!force && pcp->p_ref &&
                PCP_AGE(pcp) < seg_pmax_pcpage) {
                pcp->p_ref = 0;
                continue;
            }
            plinkp = &pcp->p_plink;
            htag0 = pcp->p_htag0;
            if (pcp->p_flags & SEGP_AMP) {
                pheadp = &((amp_t *)htag0)->a_phead;
                pmtx = &((amp_t *)htag0)->a_pmtx;
            } else {
                pheadp = &((seg_t *)htag0)->s_phead;
                pmtx = &((seg_t *)htag0)->s_pmtx;
            }
            if (!mutex_tryenter(pmtx)) {
                continue;
            }
            ASSERT(pheadp->p_lnext != pheadp);
            ASSERT(pheadp->p_lprev != pheadp);
            plinkp->p_lprev->p_lnext = plinkp->p_lnext;
            plinkp->p_lnext->p_lprev = plinkp->p_lprev;
            pcp->p_hprev->p_hnext = pcp->p_hnext;
            pcp->p_hnext->p_hprev = pcp->p_hprev;
            mutex_exit(pmtx);
            pcp->p_hprev = delcallb_list;
            delcallb_list = pcp;
            npgs_purged += btop(pcp->p_len);
        }
        if (hp->p_hnext == (struct seg_pcache *)hp) {
            seg_premove_abuck(hp, 1);
        }
        mutex_exit(&hp->p_hmutex);
        if (npgs_purged >= seg_plocked_window) {
            break;
        }
        if (!force) {
            if (npgs_purged >= npgs_to_purge) {
                break;
            }
            if (!trim && !(seg_pathr_full_ahb & 15)) {
                ASSERT(lowmem);
                if (freemem >= lotsfree + needfree) {
                    break;
                }
            }
        }
    }

    if (hlinkp == &seg_pahhead[hlix]) {
        /*
         * We processed the entire hlix active bucket list
         * but didn't find enough pages to reclaim.
         * Switch the lists and walk the other list
         * if we haven't done it yet.
         */
        mutex_enter(&seg_pmem_mtx);
        ASSERT(seg_pathr_on);
        ASSERT(seg_pahcur == !hlix);
        seg_pahcur = hlix;
        mutex_exit(&seg_pmem_mtx);
        if (++hlinks < 2) {
            hlix = !hlix;
            goto again;
        }
    } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
        seg_pahhead[hlix].p_lnext != hlinkp) {
        ASSERT(hlinkp != NULL);
        ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
        ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
        ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);

        /*
         * Reinsert the header to point to hlinkp
         * so that we start from hlinkp bucket next time around.
         */
        seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
        seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
        seg_pahhead[hlix].p_lnext = hlinkp;
        seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
        hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
        hlinkp->p_lprev = &seg_pahhead[hlix];
    }

    mutex_enter(&seg_pmem_mtx);
    ASSERT(seg_pathr_on);
    seg_pathr_on = 0;
    mutex_exit(&seg_pmem_mtx);

runcb:
    /*
     * Run the delayed callback list. segments/amps can't go away until
     * callback is executed since they must have non 0 softlockcnt. That's
     * why we don't need to hold as/seg/amp locks to execute the callback.
     */
    while (delcallb_list != NULL) {
        pcp = delcallb_list;
        delcallb_list = pcp->p_hprev;
        ASSERT(!pcp->p_active);
        (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
            pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
        npages += btop(pcp->p_len);
        if (!IS_PCP_WIRED(pcp)) {
            npages_window += btop(pcp->p_len);
        }
        kmem_cache_free(seg_pkmcache, pcp);
    }
    if (npages) {
        mutex_enter(&seg_pmem_mtx);
        ASSERT(seg_plocked >= npages);
        ASSERT(seg_plocked_window >= npages_window);
        seg_plocked -= npages;
        seg_plocked_window -= npages_window;
        mutex_exit(&seg_pmem_mtx);
    }
}

/*
 * Remove cached pages for segment(s) entries from hashtable. The segments
 * are identified by pp array. This is useful for multiple seg's cached on
 * behalf of dummy segment (ISM/DISM) with common pp array.
 */
void
seg_ppurge_wiredpp(struct page **pp)
{
    struct seg_pcache *pcp;
    struct seg_phash_wired *hp;
    pgcnt_t npages = 0;
    struct seg_pcache *delcallb_list = NULL;

    /*
     * if the cache is empty, return
     */
    if (seg_plocked == 0) {
        return;
    }
    ASSERT(seg_phashsize_wired != 0);

    for (hp = seg_phashtab_wired;
        hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
        if (hp->p_hnext == (struct seg_pcache *)hp) {
            continue;
        }
        mutex_enter(&hp->p_hmutex);
        pcp = hp->p_hnext;
        while (pcp != (struct seg_pcache *)hp) {
            ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
            ASSERT(IS_PCP_WIRED(pcp));
            /*
             * purge entries which are not active
             */
            if (!pcp->p_active && pcp->p_pp == pp) {
                ASSERT(pcp->p_htag0 != NULL);
                pcp->p_hprev->p_hnext = pcp->p_hnext;
                pcp->p_hnext->p_hprev = pcp->p_hprev;
                pcp->p_hprev = delcallb_list;
                delcallb_list = pcp;
            }
            pcp = pcp->p_hnext;
        }
        mutex_exit(&hp->p_hmutex);
        /*
         * segments can't go away until callback is executed since
         * they must have non 0 softlockcnt. That's why we don't
         * need to hold as/seg locks to execute the callback.
         */
        while (delcallb_list != NULL) {
            int done;
            pcp = delcallb_list;
            delcallb_list = pcp->p_hprev;
            ASSERT(!pcp->p_active);
            done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
                pcp->p_len, pcp->p_pp,
                pcp->p_write ? S_WRITE : S_READ, 1);
            npages += btop(pcp->p_len);
            ASSERT(IS_PCP_WIRED(pcp));
            kmem_cache_free(seg_pkmcache, pcp);
            if (done) {
                ASSERT(delcallb_list == NULL);
                goto out;
            }
        }
    }

out:
    mutex_enter(&seg_pmem_mtx);
    ASSERT(seg_plocked >= npages);
    seg_plocked -= npages;
    mutex_exit(&seg_pmem_mtx);
}

/*
 * purge all entries for a given segment. Since we
 * callback into the segment driver directly for page
 * reclaim the caller needs to hold the right locks.
 */
void
seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
{
    struct seg_pcache *delcallb_list = NULL;
    struct seg_pcache *pcp;
    struct seg_phash *hp;
    pgcnt_t npages = 0;
    void *htag0;

    if (seg_plocked == 0) {
        return;
    }
    ASSERT(seg_phashsize_win != 0);

    /*
     * If amp is not NULL use amp as a lookup tag otherwise use seg
     * as a lookup tag.
     */
    htag0 = (amp == NULL ? (void *)seg : (void *)amp);
    ASSERT(htag0 != NULL);
    if (IS_PFLAGS_WIRED(flags)) {
        hp = P_HASHBP(seg, htag0, 0, flags);
        mutex_enter(&hp->p_hmutex);
        pcp = hp->p_hnext;
        while (pcp != (struct seg_pcache *)hp) {
            ASSERT(pcp->p_hashp == hp);
            ASSERT(IS_PCP_WIRED(pcp));
            if (pcp->p_htag0 == htag0) {
                if (pcp->p_active) {
                    break;
                }
                pcp->p_hprev->p_hnext = pcp->p_hnext;
                pcp->p_hnext->p_hprev = pcp->p_hprev;
                pcp->p_hprev = delcallb_list;
                delcallb_list = pcp;
            }
            pcp = pcp->p_hnext;
        }
        mutex_exit(&hp->p_hmutex);
    } else {
        pcache_link_t *plinkp;
        pcache_link_t *pheadp;
        kmutex_t *pmtx;

        if (amp == NULL) {
            ASSERT(seg != NULL);
            pheadp = &seg->s_phead;
            pmtx = &seg->s_pmtx;
        } else {
            pheadp = &amp->a_phead;
            pmtx = &amp->a_pmtx;
        }
        mutex_enter(pmtx);
        while ((plinkp = pheadp->p_lnext) != pheadp) {
            pcp = plink2pcache(plinkp);
            ASSERT(!IS_PCP_WIRED(pcp));
            ASSERT(pcp->p_htag0 == htag0);
            hp = pcp->p_hashp;
            mutex_enter(&hp->p_hmutex);
            if (pcp->p_active) {
                mutex_exit(&hp->p_hmutex);
                break;
            }
            ASSERT(plinkp->p_lprev == pheadp);
            pheadp->p_lnext = plinkp->p_lnext;
13326695Saguzovsk plinkp->p_lnext->p_lprev = pheadp; 13330Sstevel@tonic-gate pcp->p_hprev->p_hnext = pcp->p_hnext; 13340Sstevel@tonic-gate pcp->p_hnext->p_hprev = pcp->p_hprev; 13350Sstevel@tonic-gate pcp->p_hprev = delcallb_list; 13360Sstevel@tonic-gate delcallb_list = pcp; 13376695Saguzovsk if (hp->p_hnext == (struct seg_pcache *)hp) { 13386695Saguzovsk seg_premove_abuck(hp, 0); 13396695Saguzovsk } 13406695Saguzovsk mutex_exit(&hp->p_hmutex); 13410Sstevel@tonic-gate } 13426695Saguzovsk mutex_exit(pmtx); 13430Sstevel@tonic-gate } 13440Sstevel@tonic-gate while (delcallb_list != NULL) { 13450Sstevel@tonic-gate pcp = delcallb_list; 13460Sstevel@tonic-gate delcallb_list = pcp->p_hprev; 13476695Saguzovsk ASSERT(!pcp->p_active); 13486695Saguzovsk (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len, 13496695Saguzovsk pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0); 13506695Saguzovsk npages += btop(pcp->p_len); 13516695Saguzovsk kmem_cache_free(seg_pkmcache, pcp); 13520Sstevel@tonic-gate } 13536695Saguzovsk mutex_enter(&seg_pmem_mtx); 13546695Saguzovsk ASSERT(seg_plocked >= npages); 13550Sstevel@tonic-gate seg_plocked -= npages; 13566695Saguzovsk if (!IS_PFLAGS_WIRED(flags)) { 13576695Saguzovsk ASSERT(seg_plocked_window >= npages); 13586695Saguzovsk seg_plocked_window -= npages; 13596695Saguzovsk } 13606695Saguzovsk mutex_exit(&seg_pmem_mtx); 13610Sstevel@tonic-gate } 13620Sstevel@tonic-gate 13630Sstevel@tonic-gate static void seg_pinit_mem_config(void); 13640Sstevel@tonic-gate 13650Sstevel@tonic-gate /* 13660Sstevel@tonic-gate * set up the pagelock cache 13670Sstevel@tonic-gate */ 13680Sstevel@tonic-gate static void 13690Sstevel@tonic-gate seg_pinit(void) 13700Sstevel@tonic-gate { 13710Sstevel@tonic-gate struct seg_phash *hp; 13726695Saguzovsk ulong_t i; 13736695Saguzovsk pgcnt_t physmegs; 13746695Saguzovsk 13756695Saguzovsk seg_plocked = 0; 13766695Saguzovsk seg_plocked_window = 0; 13776695Saguzovsk 13786695Saguzovsk if (segpcache_enabled == 0) { 13796695Saguzovsk seg_phashsize_win = 0; 13806695Saguzovsk seg_phashsize_wired = 0; 13816695Saguzovsk seg_pdisabled = 1; 13826695Saguzovsk return; 13836695Saguzovsk } 13840Sstevel@tonic-gate 13856695Saguzovsk seg_pdisabled = 0; 13866695Saguzovsk seg_pkmcache = kmem_cache_create("seg_pcache", 13876695Saguzovsk sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0); 13886695Saguzovsk if (segpcache_pcp_maxage_ticks <= 0) { 13896695Saguzovsk segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz; 13906695Saguzovsk } 13916695Saguzovsk seg_pmax_pcpage = segpcache_pcp_maxage_ticks; 13926695Saguzovsk seg_pathr_empty_ahb = 0; 13936695Saguzovsk seg_pathr_full_ahb = 0; 13946695Saguzovsk seg_pshrink_shift = segpcache_shrink_shift; 13956695Saguzovsk seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes); 13960Sstevel@tonic-gate 13976695Saguzovsk mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL); 13986695Saguzovsk mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL); 13996695Saguzovsk mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL); 14006695Saguzovsk cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL); 14016695Saguzovsk 14026695Saguzovsk physmegs = physmem >> (20 - PAGESHIFT); 14030Sstevel@tonic-gate 14046695Saguzovsk /* 14056695Saguzovsk * If segpcache_hashsize_win was not set in /etc/system or it has an 14066695Saguzovsk * absurd value, set it to a default.
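 *
 * Illustrative sizing sketch (editorial note, assuming the common 4K
 * page size): btop(32 * 1024) is 8, so the default below works out to
 * MAX(1024, physmem / 8) buckets; e.g. 4 GB of physical memory
 * (physmem == 1048576 pages) gives 131072 buckets. A non-power-of-two
 * /etc/system setting is then rounded to the nearest power of two by
 * the ISP2()/highbit() code that follows.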
14076695Saguzovsk */ 14086695Saguzovsk if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) { 14096695Saguzovsk /* 14106695Saguzovsk * Create one bucket per 32K (or at least per 8 pages) of 14116695Saguzovsk * available memory. 14126695Saguzovsk */ 14136695Saguzovsk pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8); 14146695Saguzovsk segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket); 14156695Saguzovsk } 14166695Saguzovsk if (!ISP2(segpcache_hashsize_win)) { 14176695Saguzovsk ulong_t rndfac = ~(1UL << 14186695Saguzovsk (highbit(segpcache_hashsize_win) - 1)); 14196695Saguzovsk rndfac &= segpcache_hashsize_win; 14206695Saguzovsk segpcache_hashsize_win += rndfac; 14216695Saguzovsk segpcache_hashsize_win = 1 << 14226695Saguzovsk (highbit(segpcache_hashsize_win) - 1); 14236695Saguzovsk } 14246695Saguzovsk seg_phashsize_win = segpcache_hashsize_win; 14256695Saguzovsk seg_phashtab_win = kmem_zalloc( 14266695Saguzovsk seg_phashsize_win * sizeof (struct seg_phash), 14276695Saguzovsk KM_SLEEP); 14286695Saguzovsk for (i = 0; i < seg_phashsize_win; i++) { 14296695Saguzovsk hp = &seg_phashtab_win[i]; 14306695Saguzovsk hp->p_hnext = (struct seg_pcache *)hp; 14316695Saguzovsk hp->p_hprev = (struct seg_pcache *)hp; 14326695Saguzovsk mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); 14336695Saguzovsk } 14340Sstevel@tonic-gate 14356695Saguzovsk seg_pahcur = 0; 14366695Saguzovsk seg_pathr_on = 0; 14376695Saguzovsk seg_pahhead[0].p_lnext = &seg_pahhead[0]; 14386695Saguzovsk seg_pahhead[0].p_lprev = &seg_pahhead[0]; 14396695Saguzovsk seg_pahhead[1].p_lnext = &seg_pahhead[1]; 14406695Saguzovsk seg_pahhead[1].p_lprev = &seg_pahhead[1]; 14416695Saguzovsk 14426695Saguzovsk /* 14436695Saguzovsk * If segpcache_hashsize_wired was not set in /etc/system or it has an 14446695Saguzovsk * absurd value, set it to a default. 14456695Saguzovsk */ 14466695Saguzovsk if (segpcache_hashsize_wired == 0 || 14476695Saguzovsk segpcache_hashsize_wired > physmem / 4) { 14486695Saguzovsk /* 14496695Saguzovsk * Choose segpcache_hashsize_wired based on physmem. 14506695Saguzovsk * Create a bucket per 128K bytes, up to 256K buckets.
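 *
 * Illustrative sizing sketch (editorial note): one bucket per 128K of
 * physical memory is physmegs << 3, so e.g. a 4 GB machine
 * (physmegs == 4096) gets MAX(1024, 32768) == 32768 wired buckets,
 * while machines of 20 GB and above use the 256K-bucket cap.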
14516695Saguzovsk */ 14526695Saguzovsk if (physmegs < 20 * 1024) { 14536695Saguzovsk segpcache_hashsize_wired = MAX(1024, physmegs << 3); 14546695Saguzovsk } else { 14556695Saguzovsk segpcache_hashsize_wired = 256 * 1024; 14560Sstevel@tonic-gate } 14570Sstevel@tonic-gate } 14586695Saguzovsk if (!ISP2(segpcache_hashsize_wired)) { 14596695Saguzovsk segpcache_hashsize_wired = 1 << 14606695Saguzovsk highbit(segpcache_hashsize_wired); 14616695Saguzovsk } 14626695Saguzovsk seg_phashsize_wired = segpcache_hashsize_wired; 14636695Saguzovsk seg_phashtab_wired = kmem_zalloc( 14646695Saguzovsk seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP); 14656695Saguzovsk for (i = 0; i < seg_phashsize_wired; i++) { 14666695Saguzovsk hp = (struct seg_phash *)&seg_phashtab_wired[i]; 14676695Saguzovsk hp->p_hnext = (struct seg_pcache *)hp; 14686695Saguzovsk hp->p_hprev = (struct seg_pcache *)hp; 14696695Saguzovsk mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); 14706695Saguzovsk } 14710Sstevel@tonic-gate 14726695Saguzovsk if (segpcache_maxwindow == 0) { 14736695Saguzovsk if (physmegs < 64) { 14746695Saguzovsk /* 3% of memory */ 14756695Saguzovsk segpcache_maxwindow = availrmem >> 5; 14766695Saguzovsk } else if (physmegs < 512) { 14776695Saguzovsk /* 12% of memory */ 14786695Saguzovsk segpcache_maxwindow = availrmem >> 3; 14796695Saguzovsk } else if (physmegs < 1024) { 14806695Saguzovsk /* 25% of memory */ 14816695Saguzovsk segpcache_maxwindow = availrmem >> 2; 14826695Saguzovsk } else if (physmegs < 2048) { 14836695Saguzovsk /* 50% of memory */ 14846695Saguzovsk segpcache_maxwindow = availrmem >> 1; 14856695Saguzovsk } else { 14866695Saguzovsk /* no limit */ 14876695Saguzovsk segpcache_maxwindow = (pgcnt_t)-1; 14886695Saguzovsk } 14896695Saguzovsk } 14906695Saguzovsk seg_pmaxwindow = segpcache_maxwindow; 14910Sstevel@tonic-gate seg_pinit_mem_config(); 14920Sstevel@tonic-gate } 14930Sstevel@tonic-gate 14940Sstevel@tonic-gate /* 14950Sstevel@tonic-gate * called by pageout if memory is low 14960Sstevel@tonic-gate */ 14970Sstevel@tonic-gate void 14980Sstevel@tonic-gate seg_preap(void) 14990Sstevel@tonic-gate { 15000Sstevel@tonic-gate /* 15016695Saguzovsk * if the cache is off or empty, return 15020Sstevel@tonic-gate */ 15036695Saguzovsk if (seg_plocked_window == 0) { 15040Sstevel@tonic-gate return; 15050Sstevel@tonic-gate } 15066695Saguzovsk ASSERT(seg_phashsize_win != 0); 15076695Saguzovsk 15086695Saguzovsk /* 15096695Saguzovsk * If somebody is already purging pcache 15106695Saguzovsk * just return. 
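 *
 * (Editorial note: seg_pdisabled is raised by seg_p_disable(), e.g.
 * from the memory-delete pre-del callback below, so pageout will not
 * wake the async reclaim thread while the cache is being drained.)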
15116695Saguzovsk */ 15126695Saguzovsk if (seg_pdisabled) { 15136695Saguzovsk return; 15146695Saguzovsk } 15156695Saguzovsk 15166695Saguzovsk cv_signal(&seg_pasync_cv); 15170Sstevel@tonic-gate } 15180Sstevel@tonic-gate 15190Sstevel@tonic-gate /* 15200Sstevel@tonic-gate * run as a background thread and reclaim pagelock 15210Sstevel@tonic-gate * pages which have not been used recently 15220Sstevel@tonic-gate */ 15230Sstevel@tonic-gate void 15240Sstevel@tonic-gate seg_pasync_thread(void) 15250Sstevel@tonic-gate { 15260Sstevel@tonic-gate callb_cpr_t cpr_info; 15270Sstevel@tonic-gate 15286695Saguzovsk if (seg_phashsize_win == 0) { 15296695Saguzovsk thread_exit(); 15306695Saguzovsk /*NOTREACHED*/ 15310Sstevel@tonic-gate } 15320Sstevel@tonic-gate 15336695Saguzovsk seg_pasync_thr = curthread; 15346695Saguzovsk 15356695Saguzovsk CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx, 15366695Saguzovsk callb_generic_cpr, "seg_pasync"); 15376695Saguzovsk 15386695Saguzovsk if (segpcache_reap_ticks <= 0) { 15396695Saguzovsk segpcache_reap_ticks = segpcache_reap_sec * hz; 15406695Saguzovsk } 15410Sstevel@tonic-gate 15426695Saguzovsk mutex_enter(&seg_pasync_mtx); 15436695Saguzovsk for (;;) { 15446695Saguzovsk CALLB_CPR_SAFE_BEGIN(&cpr_info); 1545*11066Srafael.vanoni@sun.com (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx, 1546*11066Srafael.vanoni@sun.com segpcache_reap_ticks, TR_CLOCK_TICK); 15476695Saguzovsk CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx); 15486695Saguzovsk if (seg_pdisabled == 0) { 15496695Saguzovsk seg_ppurge_async(0); 15506695Saguzovsk } 15510Sstevel@tonic-gate } 15520Sstevel@tonic-gate } 15530Sstevel@tonic-gate 15540Sstevel@tonic-gate static struct kmem_cache *seg_cache; 15550Sstevel@tonic-gate 15560Sstevel@tonic-gate /* 15570Sstevel@tonic-gate * Initialize segment management data structures. 15580Sstevel@tonic-gate */ 15590Sstevel@tonic-gate void 15600Sstevel@tonic-gate seg_init(void) 15610Sstevel@tonic-gate { 15620Sstevel@tonic-gate kstat_t *ksp; 15630Sstevel@tonic-gate 15646695Saguzovsk seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), 15656695Saguzovsk 0, NULL, NULL, NULL, NULL, NULL, 0); 15660Sstevel@tonic-gate 15670Sstevel@tonic-gate ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED, 15685928Sjj204856 segadvstat_ndata, KSTAT_FLAG_VIRTUAL); 15690Sstevel@tonic-gate if (ksp) { 15700Sstevel@tonic-gate ksp->ks_data = (void *)segadvstat_ptr; 15710Sstevel@tonic-gate kstat_install(ksp); 15720Sstevel@tonic-gate } 15730Sstevel@tonic-gate 15740Sstevel@tonic-gate seg_pinit(); 15750Sstevel@tonic-gate } 15760Sstevel@tonic-gate 15770Sstevel@tonic-gate /* 15780Sstevel@tonic-gate * Allocate a segment to cover [base, base+size] 15790Sstevel@tonic-gate * and attach it to the specified address space.
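 *
 * Illustrative caller-side sketch (editorial note; "myseg_ops" and
 * "mydata" are hypothetical and error handling is elided):
 *
 *	struct seg *seg;
 *
 *	if ((seg = seg_alloc(as, addr, len)) == NULL)
 *		return (ENOMEM);
 *	seg->s_ops = &myseg_ops;
 *	seg->s_data = mydata;
 *
 * seg_alloc() page-aligns the range, attaches the new segment to "as"
 * via seg_attach(), and leaves s_ops/s_data for the caller (typically
 * a segment driver create routine) to fill in.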
15800Sstevel@tonic-gate */ 15810Sstevel@tonic-gate struct seg * 15820Sstevel@tonic-gate seg_alloc(struct as *as, caddr_t base, size_t size) 15830Sstevel@tonic-gate { 15840Sstevel@tonic-gate struct seg *new; 15850Sstevel@tonic-gate caddr_t segbase; 15860Sstevel@tonic-gate size_t segsize; 15870Sstevel@tonic-gate 15880Sstevel@tonic-gate segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK); 15890Sstevel@tonic-gate segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) - 15900Sstevel@tonic-gate (uintptr_t)segbase; 15910Sstevel@tonic-gate 15920Sstevel@tonic-gate if (!valid_va_range(&segbase, &segsize, segsize, AH_LO)) 15930Sstevel@tonic-gate return ((struct seg *)NULL); /* bad virtual addr range */ 15940Sstevel@tonic-gate 15950Sstevel@tonic-gate if (as != &kas && 15960Sstevel@tonic-gate valid_usr_range(segbase, segsize, 0, as, 15970Sstevel@tonic-gate as->a_userlimit) != RANGE_OKAY) 15980Sstevel@tonic-gate return ((struct seg *)NULL); /* bad virtual addr range */ 15990Sstevel@tonic-gate 16000Sstevel@tonic-gate new = kmem_cache_alloc(seg_cache, KM_SLEEP); 16010Sstevel@tonic-gate new->s_ops = NULL; 16020Sstevel@tonic-gate new->s_data = NULL; 16030Sstevel@tonic-gate new->s_szc = 0; 16040Sstevel@tonic-gate new->s_flags = 0; 16056695Saguzovsk mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL); 16066695Saguzovsk new->s_phead.p_lnext = &new->s_phead; 16076695Saguzovsk new->s_phead.p_lprev = &new->s_phead; 16080Sstevel@tonic-gate if (seg_attach(as, segbase, segsize, new) < 0) { 16090Sstevel@tonic-gate kmem_cache_free(seg_cache, new); 16100Sstevel@tonic-gate return ((struct seg *)NULL); 16110Sstevel@tonic-gate } 16120Sstevel@tonic-gate /* caller must fill in ops, data */ 16130Sstevel@tonic-gate return (new); 16140Sstevel@tonic-gate } 16150Sstevel@tonic-gate 16160Sstevel@tonic-gate /* 16170Sstevel@tonic-gate * Attach a segment to the address space. Used by seg_alloc() 16180Sstevel@tonic-gate * and for kernel startup to attach to static segments. 16190Sstevel@tonic-gate */ 16200Sstevel@tonic-gate int 16210Sstevel@tonic-gate seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg) 16220Sstevel@tonic-gate { 16230Sstevel@tonic-gate seg->s_as = as; 16240Sstevel@tonic-gate seg->s_base = base; 16250Sstevel@tonic-gate seg->s_size = size; 16260Sstevel@tonic-gate 16270Sstevel@tonic-gate /* 16280Sstevel@tonic-gate * as_addseg() will add the segment at the appropriate point 16290Sstevel@tonic-gate * in the list. It will return -1 if there is overlap with 16300Sstevel@tonic-gate * an already existing segment. 16310Sstevel@tonic-gate */ 16320Sstevel@tonic-gate return (as_addseg(as, seg)); 16330Sstevel@tonic-gate } 16340Sstevel@tonic-gate 16350Sstevel@tonic-gate /* 16360Sstevel@tonic-gate * Unmap a segment and free it from its associated address space. 16370Sstevel@tonic-gate * This should be called by anybody who's finished with a whole segment's 16380Sstevel@tonic-gate * mapping. Just calls SEGOP_UNMAP() on the whole mapping. It is the 16390Sstevel@tonic-gate * responsibility of the segment driver to unlink the segment 16400Sstevel@tonic-gate * from the address space, and to free public and private data structures 16410Sstevel@tonic-gate * associated with the segment. (This is typically done by a call to 16420Sstevel@tonic-gate * seg_free()).
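 *
 * (Editorial note: since the driver's unmap of the full range typically
 * ends in seg_free(), callers should not reference the seg pointer once
 * seg_unmap() returns.)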
16430Sstevel@tonic-gate */ 16440Sstevel@tonic-gate void 16450Sstevel@tonic-gate seg_unmap(struct seg *seg) 16460Sstevel@tonic-gate { 16470Sstevel@tonic-gate #ifdef DEBUG 16480Sstevel@tonic-gate int ret; 16490Sstevel@tonic-gate #endif /* DEBUG */ 16500Sstevel@tonic-gate 16510Sstevel@tonic-gate ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 16520Sstevel@tonic-gate 16530Sstevel@tonic-gate /* Shouldn't have called seg_unmap if mapping isn't yet established */ 16540Sstevel@tonic-gate ASSERT(seg->s_data != NULL); 16550Sstevel@tonic-gate 16560Sstevel@tonic-gate /* Unmap the whole mapping */ 16570Sstevel@tonic-gate #ifdef DEBUG 16580Sstevel@tonic-gate ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 16590Sstevel@tonic-gate ASSERT(ret == 0); 16600Sstevel@tonic-gate #else 16610Sstevel@tonic-gate SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 16620Sstevel@tonic-gate #endif /* DEBUG */ 16630Sstevel@tonic-gate } 16640Sstevel@tonic-gate 16650Sstevel@tonic-gate /* 16660Sstevel@tonic-gate * Free the segment from its associated as. This should only be called 16670Sstevel@tonic-gate * if a mapping to the segment has not yet been established (e.g., if 16680Sstevel@tonic-gate * an error occurs in the middle of doing an as_map when the segment 16690Sstevel@tonic-gate * has already been partially set up) or if it has already been deleted 16700Sstevel@tonic-gate * (e.g., from a segment driver unmap routine if the unmap applies to the 16710Sstevel@tonic-gate * entire segment). If the mapping is currently set up then seg_unmap() should 16720Sstevel@tonic-gate * be called instead. 16730Sstevel@tonic-gate */ 16740Sstevel@tonic-gate void 16750Sstevel@tonic-gate seg_free(struct seg *seg) 16760Sstevel@tonic-gate { 16770Sstevel@tonic-gate register struct as *as = seg->s_as; 16780Sstevel@tonic-gate struct seg *tseg = as_removeseg(as, seg); 16790Sstevel@tonic-gate 16800Sstevel@tonic-gate ASSERT(tseg == seg); 16810Sstevel@tonic-gate 16820Sstevel@tonic-gate /* 16830Sstevel@tonic-gate * If the segment private data field is NULL, 16840Sstevel@tonic-gate * then segment driver is not attached yet. 16850Sstevel@tonic-gate */ 16860Sstevel@tonic-gate if (seg->s_data != NULL) 16870Sstevel@tonic-gate SEGOP_FREE(seg); 16880Sstevel@tonic-gate 16896695Saguzovsk mutex_destroy(&seg->s_pmtx); 16906695Saguzovsk ASSERT(seg->s_phead.p_lnext == &seg->s_phead); 16916695Saguzovsk ASSERT(seg->s_phead.p_lprev == &seg->s_phead); 16920Sstevel@tonic-gate kmem_cache_free(seg_cache, seg); 16930Sstevel@tonic-gate } 16940Sstevel@tonic-gate 16950Sstevel@tonic-gate /*ARGSUSED*/ 16960Sstevel@tonic-gate static void 16970Sstevel@tonic-gate seg_p_mem_config_post_add( 16980Sstevel@tonic-gate void *arg, 16990Sstevel@tonic-gate pgcnt_t delta_pages) 17000Sstevel@tonic-gate { 17010Sstevel@tonic-gate /* Nothing to do. */ 17020Sstevel@tonic-gate } 17030Sstevel@tonic-gate 17043480Sjfrank void 17053480Sjfrank seg_p_enable(void) 17063480Sjfrank { 17076695Saguzovsk mutex_enter(&seg_pcache_mtx); 17086695Saguzovsk ASSERT(seg_pdisabled != 0); 17096695Saguzovsk seg_pdisabled--; 17106695Saguzovsk mutex_exit(&seg_pcache_mtx); 17113480Sjfrank } 17123480Sjfrank 17133480Sjfrank /* 17143480Sjfrank * seg_p_disable - disables seg_pcache, and then attempts to empty the 17153480Sjfrank * cache. 17163480Sjfrank * Returns SEGP_SUCCESS if the cache was successfully emptied, or 17173480Sjfrank * SEGP_FAIL if the cache could not be emptied. 
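 *
 * Illustrative pairing (editorial sketch; it mirrors the memory delete
 * callbacks later in this file):
 *
 *	if (seg_p_disable() != SEGP_SUCCESS)
 *		cmn_err(CE_NOTE, "!couldn't empty pagelock cache");
 *	... work that needs the cache to stay disabled ...
 *	seg_p_enable();
 *
 * seg_pdisabled is bumped even when SEGP_FAIL is returned, so every
 * call must eventually be balanced by a call to seg_p_enable().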
17183480Sjfrank */ 17193480Sjfrank int 17203480Sjfrank seg_p_disable(void) 17213480Sjfrank { 17223480Sjfrank pgcnt_t old_plocked; 17233480Sjfrank int stall_count = 0; 17243480Sjfrank 17256695Saguzovsk mutex_enter(&seg_pcache_mtx); 17266695Saguzovsk seg_pdisabled++; 17276695Saguzovsk ASSERT(seg_pdisabled != 0); 17286695Saguzovsk mutex_exit(&seg_pcache_mtx); 17293480Sjfrank 17303480Sjfrank /* 17313480Sjfrank * Attempt to empty the cache. Terminate if seg_plocked does not 17323480Sjfrank * diminish with SEGP_STALL_THRESHOLD consecutive attempts. 17333480Sjfrank */ 17343480Sjfrank while (seg_plocked != 0) { 17356695Saguzovsk ASSERT(seg_phashsize_win != 0); 17363480Sjfrank old_plocked = seg_plocked; 17376695Saguzovsk seg_ppurge_async(1); 17383480Sjfrank if (seg_plocked == old_plocked) { 17393480Sjfrank if (stall_count++ > SEGP_STALL_THRESHOLD) { 17403480Sjfrank return (SEGP_FAIL); 17413480Sjfrank } 17423480Sjfrank } else 17433480Sjfrank stall_count = 0; 17443480Sjfrank if (seg_plocked != 0) 17453480Sjfrank delay(hz/SEGP_PREDEL_DELAY_FACTOR); 17463480Sjfrank } 17473480Sjfrank return (SEGP_SUCCESS); 17483480Sjfrank } 17493480Sjfrank 17500Sstevel@tonic-gate /* 17510Sstevel@tonic-gate * Attempt to purge seg_pcache. May need to return before this has 17520Sstevel@tonic-gate * completed to allow other pre_del callbacks to unlock pages. This is 17530Sstevel@tonic-gate * ok because: 17546695Saguzovsk * 1) The seg_pdisabled flag has been set so at least we won't 17550Sstevel@tonic-gate * cache anymore locks and the locks we couldn't purge 17560Sstevel@tonic-gate * will not be held if they do get released by a subsequent 17570Sstevel@tonic-gate * pre-delete callback. 17580Sstevel@tonic-gate * 17590Sstevel@tonic-gate * 2) The rest of the memory delete thread processing does not 17600Sstevel@tonic-gate * depend on the changes made in this pre-delete callback. No 17610Sstevel@tonic-gate * panics will result, the worst that will happen is that the 17620Sstevel@tonic-gate * DR code will timeout and cancel the delete. 
17630Sstevel@tonic-gate */ 17640Sstevel@tonic-gate /*ARGSUSED*/ 17650Sstevel@tonic-gate static int 17660Sstevel@tonic-gate seg_p_mem_config_pre_del( 17670Sstevel@tonic-gate void *arg, 17680Sstevel@tonic-gate pgcnt_t delta_pages) 17690Sstevel@tonic-gate { 17706695Saguzovsk if (seg_phashsize_win == 0) { 17716695Saguzovsk return (0); 17726695Saguzovsk } 17733480Sjfrank if (seg_p_disable() != SEGP_SUCCESS) 17743480Sjfrank cmn_err(CE_NOTE, 17753480Sjfrank "!Pre-delete couldn't purge"" pagelock cache - continuing"); 17760Sstevel@tonic-gate return (0); 17770Sstevel@tonic-gate } 17780Sstevel@tonic-gate 17790Sstevel@tonic-gate /*ARGSUSED*/ 17800Sstevel@tonic-gate static void 17810Sstevel@tonic-gate seg_p_mem_config_post_del( 17820Sstevel@tonic-gate void *arg, 17830Sstevel@tonic-gate pgcnt_t delta_pages, 17840Sstevel@tonic-gate int cancelled) 17850Sstevel@tonic-gate { 17866695Saguzovsk if (seg_phashsize_win == 0) { 17876695Saguzovsk return; 17886695Saguzovsk } 17893480Sjfrank seg_p_enable(); 17900Sstevel@tonic-gate } 17910Sstevel@tonic-gate 17920Sstevel@tonic-gate static kphysm_setup_vector_t seg_p_mem_config_vec = { 17930Sstevel@tonic-gate KPHYSM_SETUP_VECTOR_VERSION, 17940Sstevel@tonic-gate seg_p_mem_config_post_add, 17950Sstevel@tonic-gate seg_p_mem_config_pre_del, 17960Sstevel@tonic-gate seg_p_mem_config_post_del, 17970Sstevel@tonic-gate }; 17980Sstevel@tonic-gate 17990Sstevel@tonic-gate static void 18000Sstevel@tonic-gate seg_pinit_mem_config(void) 18010Sstevel@tonic-gate { 18020Sstevel@tonic-gate int ret; 18030Sstevel@tonic-gate 18040Sstevel@tonic-gate ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL); 18050Sstevel@tonic-gate /* 18060Sstevel@tonic-gate * Want to catch this in the debug kernel. At run time, if the 18070Sstevel@tonic-gate * callbacks don't get run, all will be OK as the disable just makes 18080Sstevel@tonic-gate * it more likely that the pages can be collected. 18090Sstevel@tonic-gate */ 18100Sstevel@tonic-gate ASSERT(ret == 0); 18110Sstevel@tonic-gate } 18123247Sgjelinek 18133247Sgjelinek /* 18143247Sgjelinek * Verify that the segment is not a shared anonymous segment which reserves 18153247Sgjelinek * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred 18163247Sgjelinek * from one zone to another if any segments are shared. This is because the 18173247Sgjelinek * last process to exit will credit the swap reservation. This could lead 18183247Sgjelinek * to the swap being reserved by one zone, and credited to another. 18193247Sgjelinek */ 18203247Sgjelinek boolean_t 18213247Sgjelinek seg_can_change_zones(struct seg *seg) 18223247Sgjelinek { 18233247Sgjelinek struct segvn_data *svd; 18243247Sgjelinek 18253247Sgjelinek if (seg->s_ops == &segspt_shmops) 18263247Sgjelinek return (B_FALSE); 18273247Sgjelinek 18283247Sgjelinek if (seg->s_ops == &segvn_ops) { 18293247Sgjelinek svd = (struct segvn_data *)seg->s_data; 18303247Sgjelinek if (svd->type == MAP_SHARED && 18313247Sgjelinek svd->amp != NULL && 18323247Sgjelinek svd->amp->swresv > 0) 18333247Sgjelinek return (B_FALSE); 18343247Sgjelinek } 18353247Sgjelinek return (B_TRUE); 18363247Sgjelinek } 18373247Sgjelinek 18383247Sgjelinek /* 18393247Sgjelinek * Return swap reserved by a segment backing a private mapping.
18403247Sgjelinek */ 18413247Sgjelinek size_t 18423247Sgjelinek seg_swresv(struct seg *seg) 18433247Sgjelinek { 18443247Sgjelinek struct segvn_data *svd; 18453247Sgjelinek size_t swap = 0; 18463247Sgjelinek 18473247Sgjelinek if (seg->s_ops == &segvn_ops) { 18483247Sgjelinek svd = (struct segvn_data *)seg->s_data; 18493247Sgjelinek if (svd->type == MAP_PRIVATE && svd->swresv > 0) 18503247Sgjelinek swap = svd->swresv; 18513247Sgjelinek } 18523247Sgjelinek return (swap); 18533247Sgjelinek } 1854
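/*
 * Illustrative sketch (editorial note, not part of the original file):
 * a hypothetical caller could total the private swap reserved across an
 * address space by walking its segments with the usual <vm/as.h>
 * helpers, e.g.
 *
 *	size_t swap = 0;
 *	struct seg *seg;
 *
 *	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
 *	for (seg = AS_SEGFIRST(as); seg != NULL;
 *	    seg = AS_SEGNEXT(as, seg))
 *		swap += seg_swresv(seg);
 *	AS_LOCK_EXIT(as, &as->a_lock);
 *
 * The loop is only a sketch of how seg_swresv() is meant to be used.
 */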