/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - segment management.
 */

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/fs/swapnode.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
#include <sys/mman.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/anon.h>

/*
 * kstats for segment advise
 */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit", KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss", KSTAT_DATA_ULONG },
};

kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);

/*
 * entry in the segment page cache
 */
struct seg_pcache {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	pcache_link_t		p_plink;	/* per segment/amp list */
	void			*p_htag0;	/* segment/amp pointer */
	caddr_t			p_addr;		/* base address/anon_idx */
	size_t			p_len;		/* total bytes */
	size_t			p_wlen;		/* writable bytes at p_addr */
	struct page		**p_pp;		/* pp shadow list */
	seg_preclaim_cbfunc_t	p_callback;	/* reclaim callback function */
	clock_t			p_lbolt;	/* lbolt from last use */
	struct seg_phash	*p_hashp;	/* our pcache hash bucket */
	uint_t			p_active;	/* active count */
	uchar_t			p_write;	/* true if S_WRITE */
	uchar_t			p_ref;		/* reference byte */
	ushort_t		p_flags;	/* bit flags */
};

struct seg_phash {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
	pcache_link_t		p_halink[2];	/* active bucket linkages */
};

struct seg_phash_wired {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
};
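
/*
 * How the pieces below fit together (an illustrative sketch only; the
 * individual function comments are authoritative): a segment driver first
 * calls seg_pinsert_check() to see whether caching can succeed, builds the
 * pp shadow list, and then calls seg_pinsert() to cache it.  Later pagelock
 * requests for the same range call seg_plookup(), which bumps p_active and
 * returns the cached shadow list.  Every successful lookup/insert is
 * balanced by a seg_pinactive() call at unlock time, and inactive entries
 * are eventually reclaimed by the asynchronous purge code or by
 * seg_ppurge()/seg_ppurge_wiredpp().
 */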

/*
 * A parameter to control a maximum number of bytes that can be
 * purged from pcache at a time.
 */
#define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)

/*
 * log2(fraction of pcache to reclaim at a time).
 */
#define	P_SHRINK_SHFT		(5)

/*
 * The following variables can be tuned via /etc/system.
 */

int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
int	segpcache_reap_sec = 1;		/* reap check rate in secs */
clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */
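
/*
 * For example, pcache can be disabled or its window bounded from
 * /etc/system (the values below are only illustrative, not recommendations):
 *
 *	set segpcache_enabled = 0
 *	set segpcache_maxwindow = 0x20000
 *	set segpcache_pcp_maxage_sec = 2
 */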

static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
static kcondvar_t seg_pasync_cv;

#pragma align 64(pctrl1)
#pragma align 64(pctrl2)
#pragma align 64(pctrl3)

/*
 * Keep frequently used variables together in one cache line.
 */
static struct p_ctrl1 {
	uint_t p_disabled;		/* if not 0, caching temporarily off */
	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
	size_t p_hashwin_sz;		/* # of non wired buckets */
	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
	size_t p_hashwired_sz;		/* # of wired buckets */
	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
#ifdef _LP64
	ulong_t pad[1];
#endif /* _LP64 */
} pctrl1;

static struct p_ctrl2 {
	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
	pgcnt_t	p_locked_win;	/* # pages from window */
	pgcnt_t	p_locked;	/* # of pages cached by pagelock */
	uchar_t	p_ahcur;	/* current active links for insert/delete */
	uchar_t p_athr_on;	/* async reclaim thread is running. */
	pcache_link_t p_ahhead[2]; /* active buckets linkages */
} pctrl2;

static struct p_ctrl3 {
	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
	ulong_t p_athr_full_ahb;	/* athread walk stats */
	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
	int	p_shrink_shft;		/* reap shift factor */
#ifdef _LP64
	ulong_t pad[3];
#endif /* _LP64 */
} pctrl3;

#define	seg_pdisabled			pctrl1.p_disabled
#define	seg_pmaxwindow			pctrl1.p_maxwin
#define	seg_phashsize_win		pctrl1.p_hashwin_sz
#define	seg_phashtab_win		pctrl1.p_htabwin
#define	seg_phashsize_wired		pctrl1.p_hashwired_sz
#define	seg_phashtab_wired		pctrl1.p_htabwired
#define	seg_pkmcache			pctrl1.p_kmcache
#define	seg_pmem_mtx			pctrl2.p_mem_mtx
#define	seg_plocked_window		pctrl2.p_locked_win
#define	seg_plocked			pctrl2.p_locked
#define	seg_pahcur			pctrl2.p_ahcur
#define	seg_pathr_on			pctrl2.p_athr_on
#define	seg_pahhead			pctrl2.p_ahhead
#define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
#define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
#define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
#define	seg_pshrink_shift		pctrl3.p_shrink_shft
#define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages

#define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
#define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
#define	P_BASESHIFT			(6)

kthread_t *seg_pasync_thr;

extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

#define	IS_PFLAGS_WIRED(flags)	((flags) & SEGP_FORCE_WIRED)
#define	IS_PCP_WIRED(pcp)	IS_PFLAGS_WIRED((pcp)->p_flags)

#define	LBOLT_DELTA(t)	((ulong_t)(lbolt - (t)))

#define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)

/*
 * htag0 argument can be a seg or amp pointer.
 */
#define	P_HASHBP(seg, htag0, addr, flags)				\
	(IS_PFLAGS_WIRED((flags)) ?					\
	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &	\
	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :			\
	    (&seg_phashtab_win[P_HASHWIN_MASK &				\
	    (((uintptr_t)(htag0) >> 3) ^				\
	    ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?		\
	    (flags >> 16) : page_get_shift((seg)->s_szc))))]))
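
/*
 * Restating the macro above (illustrative only): wired entries hash on the
 * seg/amp pointer alone (htag0 >> P_BASESHIFT) into seg_phashtab_wired,
 * while non wired entries hash on both the seg/amp pointer and the address,
 * with addr shifted either by the page shift for the segment's page size
 * or, when SEGP_PSHIFT is set, by the shift value carried in the upper bits
 * of flags.  Mixing in the address spreads the many entries of a single
 * large segment or amp across buckets.
 */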

/*
 * htag0 argument can be a seg or amp pointer.
 */
#define	P_MATCH(pcp, htag0, addr, len)					\
	((pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

#define	P_MATCH_PP(pcp, htag0, addr, len, pp)				\
	((pcp)->p_pp == (pp) &&						\
	(pcp)->p_htag0 == (htag0) &&					\
	(pcp)->p_addr == (addr) &&					\
	(pcp)->p_len >= (len))

#define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
    offsetof(struct seg_pcache, p_plink)))

#define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) -	\
    offsetof(struct seg_phash, p_halink[l])))
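
/*
 * plink2pcache() and hlink2phash() are container-of style conversions: given
 * a pointer to the embedded pcache_link_t they recover the enclosing
 * seg_pcache or seg_phash.  For example (illustrative), if plinkp points at
 * pcp->p_plink then plink2pcache(plinkp) == pcp.
 */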

/*
 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
 * active hash bucket lists. We maintain active bucket lists to reduce the
 * overhead of finding active buckets during asynchronous purging since there
 * can be 10s of millions of buckets on a large system but only a small subset
 * of them in actual use.
 *
 * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
 * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
 * buckets. The other list is used by asynchronous purge thread. This allows
 * the purge thread to walk its active list without holding seg_pmem_mtx for a
 * long time. When asynchronous thread is done with its list it switches to
 * current active list and makes the list it just finished processing as
 * current active list.
 *
 * seg_padd_abuck() only adds the bucket to current list if the bucket is not
 * yet on any list. seg_premove_abuck() may remove the bucket from either
 * list. If the bucket is on current list it will be always removed. Otherwise
 * the bucket is only removed if asynchronous purge thread is not currently
 * running or seg_premove_abuck() is called by asynchronous purge thread
 * itself. A given bucket can only be on one of active lists at a time. These
 * routines should be called with per bucket lock held. The routines use
 * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
 * the first entry is added to the bucket chain and seg_premove_abuck() must
 * be called after the last pcp entry is deleted from its chain. Per bucket
 * lock should be held by the callers. This avoids a potential race condition
 * when seg_premove_abuck() removes a bucket after pcp entries are added to
 * its list after the caller checked that the bucket has no entries. (this
 * race would cause a loss of an active bucket from the active lists).
 *
 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
 * New entries are added to the end of the list since LRU is used as the
 * purging policy.
 */
static void
seg_padd_abuck(struct seg_phash *hp)
{
	int lix;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext != hp);
	ASSERT((struct seg_phash *)hp->p_hprev != hp);
	ASSERT(hp->p_hnext == hp->p_hprev);
	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	/*
	 * This bucket can already be on one of active lists
	 * since seg_premove_abuck() may have failed to remove it
	 * before.
	 */
	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);
	if (hp->p_halink[lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If this bucket is still on list !lix async thread can't yet remove
	 * it since we hold here per bucket lock. In this case just return
	 * since async thread will eventually find and process this bucket.
	 */
	if (hp->p_halink[!lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
	/*
	 * This bucket is not on any active bucket list yet.
	 * Add the bucket to the tail of current active list.
	 */
	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
	mutex_exit(&seg_pmem_mtx);
}

static void
seg_premove_abuck(struct seg_phash *hp, int athr)
{
	int lix;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));
	ASSERT((struct seg_phash *)hp->p_hnext == hp);
	ASSERT((struct seg_phash *)hp->p_hprev == hp);
	ASSERT(hp >= seg_phashtab_win &&
	    hp < &seg_phashtab_win[seg_phashsize_win]);

	if (athr) {
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur <= 1);
		/*
		 * We are called by asynchronous thread that found this bucket
		 * on not currently active (i.e. !seg_pahcur) list. Remove it
		 * from there.  Per bucket lock we are holding makes sure
		 * seg_pinsert() can't sneak in and add pcp entries to this
		 * bucket right before we remove the bucket from its list.
		 */
		lix = !seg_pahcur;
		ASSERT(hp->p_halink[lix].p_lnext != NULL);
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		return;
	}

	mutex_enter(&seg_pmem_mtx);
	lix = seg_pahcur;
	ASSERT(lix >= 0 && lix <= 1);

	/*
	 * If the bucket is on currently active list just remove it from
	 * there.
	 */
	if (hp->p_halink[lix].p_lnext != NULL) {
		ASSERT(hp->p_halink[lix].p_lprev != NULL);
		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
		mutex_exit(&seg_pmem_mtx);
		return;
	}
	ASSERT(hp->p_halink[lix].p_lprev == NULL);

	/*
	 * If asynchronous thread is not running we can remove the bucket from
	 * not currently active list. The bucket must be on this list since we
	 * already checked that it's not on the other list and the bucket from
	 * which we just deleted the last pcp entry must be still on one of the
	 * active bucket lists.
	 */
	lix = !lix;
	ASSERT(hp->p_halink[lix].p_lnext != NULL);
	ASSERT(hp->p_halink[lix].p_lprev != NULL);

	if (!seg_pathr_on) {
		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
		hp->p_halink[lix].p_lnext = NULL;
		hp->p_halink[lix].p_lprev = NULL;
	}
	mutex_exit(&seg_pmem_mtx);
}

/*
 * Check if bucket pointed by hp already has a pcp entry that matches request
 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
 * Also delete matching entries that cover smaller address range but start
 * at the same address as addr argument. Return the list of deleted entries if
 * any. This is an internal helper function called from seg_pinsert() only
 * for non wired shadow lists. The caller already holds a per seg/amp list
 * lock.
 */
static struct seg_pcache *
seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
    caddr_t addr, size_t len, int *found)
{
	struct seg_pcache *pcp;
	struct seg_pcache *delcallb_list = NULL;

	ASSERT(MUTEX_HELD(&hp->p_hmutex));

	*found = 0;
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
			ASSERT(!IS_PCP_WIRED(pcp));
			if (pcp->p_len < len) {
				pcache_link_t *plinkp;
				if (pcp->p_active) {
					continue;
				}
				plinkp = &pcp->p_plink;
				plinkp->p_lprev->p_lnext = plinkp->p_lnext;
				plinkp->p_lnext->p_lprev = plinkp->p_lprev;
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			} else {
				*found = 1;
				break;
			}
		}
	}
	return (delcallb_list);
}

/*
 * lookup an address range in pagelock cache. Return shadow list and bump up
 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
 * as a lookup tag.
 */
struct page **
seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    enum seg_rw rw, uint_t flags)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	void *htag0;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);

	/*
	 * Skip pagelock cache, while DR is in progress or
	 * seg_pcache is off.
	 */
	if (seg_pdisabled) {
		return (NULL);
	}
	ASSERT(seg_phashsize_win != 0);

	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
	hp = P_HASHBP(seg, htag0, addr, flags);
	mutex_enter(&hp->p_hmutex);
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (P_MATCH(pcp, htag0, addr, len)) {
			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
			/*
			 * If this request wants to write pages
			 * but write permissions starting from
			 * addr don't cover the entire length len
			 * return lookup failure back to the caller.
			 * It will check protections and fail this
			 * pagelock operation with EACCES error.
			 */
			if (rw == S_WRITE && pcp->p_wlen < len) {
				break;
			}
			if (pcp->p_active == UINT_MAX) {
				break;
			}
			pcp->p_active++;
			if (rw == S_WRITE && !pcp->p_write) {
				pcp->p_write = 1;
			}
			mutex_exit(&hp->p_hmutex);
			return (pcp->p_pp);
		}
	}
	mutex_exit(&hp->p_hmutex);
	return (NULL);
}
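
/*
 * Note (illustrative restatement): a successful seg_plookup() is expected to
 * be balanced by a seg_pinactive() call with the same htag0, addr, len and
 * the same shadow list pointer, since seg_pinactive() matches entries with
 * P_MATCH_PP() before dropping the active count.
 */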

/*
 * mark address range inactive. If the cache is off or the address range is
 * not in the cache or another shadow list that covers bigger range is found
 * we call the segment driver to reclaim the pages. Otherwise just decrement
 * active count and set ref bit. If amp is not NULL use amp as a lookup tag
 * otherwise use seg as a lookup tag.
 */
void
seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
    seg_preclaim_cbfunc_t callback)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	kmutex_t *pmtx = NULL;
	pcache_link_t *pheadp;
	void *htag0;
	pgcnt_t npages = 0;
	int keep = 0;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);

	htag0 = (amp == NULL ? (void *)seg : (void *)amp);

	/*
	 * Skip lookup if pcache is not configured.
	 */
	if (seg_phashsize_win == 0) {
		goto out;
	}

	/*
	 * Grab per seg/amp lock before hash lock if we are going to remove
	 * inactive entry from pcache.
	 */
	if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
		if (amp == NULL) {
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
		}
		mutex_enter(pmtx);
	}

	hp = P_HASHBP(seg, htag0, addr, flags);
	mutex_enter(&hp->p_hmutex);
again:
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_active);
			if (keep) {
				/*
				 * Don't remove this pcp entry
				 * if we didn't find duplicate
				 * shadow lists on second search.
				 * Somebody removed those duplicates
				 * since we dropped hash lock after first
				 * search.
				 */
				ASSERT(pmtx != NULL);
				ASSERT(!IS_PFLAGS_WIRED(flags));
				mutex_exit(pmtx);
				pmtx = NULL;
			}
			pcp->p_active--;
			if (pcp->p_active == 0 && (pmtx != NULL ||
			    (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {

				/*
				 * This entry is no longer active. Remove it
				 * now either because pcaching is temporarily
				 * disabled or there're other pcp entries that
				 * can match this pagelock request (i.e. this
				 * entry is a duplicate).
				 */

				ASSERT(callback == pcp->p_callback);
				if (pmtx != NULL) {
					pcache_link_t *plinkp = &pcp->p_plink;
					ASSERT(!IS_PCP_WIRED(pcp));
					ASSERT(pheadp->p_lnext != pheadp);
					ASSERT(pheadp->p_lprev != pheadp);
					plinkp->p_lprev->p_lnext =
					    plinkp->p_lnext;
					plinkp->p_lnext->p_lprev =
					    plinkp->p_lprev;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				if (!IS_PCP_WIRED(pcp) &&
				    hp->p_hnext == (struct seg_pcache *)hp) {
					/*
					 * We removed the last entry from this
					 * bucket. Now remove the bucket from
					 * its active list.
					 */
					seg_premove_abuck(hp, 0);
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				len = pcp->p_len;
				npages = btop(len);
				if (rw != S_WRITE && pcp->p_write) {
					rw = S_WRITE;
				}
				kmem_cache_free(seg_pkmcache, pcp);
				goto out;
			} else {
				/*
				 * We found a matching pcp entry but will not
				 * free it right away even if it's no longer
				 * active.
				 */
				if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
					/*
					 * Set the reference bit and mark the
					 * time of last access to this pcp
					 * so that asynchronous thread doesn't
					 * free it immediately since
					 * it may be reactivated very soon.
					 */
					pcp->p_lbolt = lbolt;
					pcp->p_ref = 1;
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				return;
			}
		} else if (!IS_PFLAGS_WIRED(flags) &&
		    P_MATCH(pcp, htag0, addr, len)) {
			/*
			 * This is a duplicate pcp entry. This situation may
			 * happen if a bigger shadow list that covers our
			 * range was added while our entry was still active.
			 * Now we can free our pcp entry if it becomes
			 * inactive.
			 */
			if (!pcp->p_active) {
				/*
				 * Mark this entry as referenced just in case
				 * we'll free our own pcp entry soon.
				 */
				pcp->p_lbolt = lbolt;
				pcp->p_ref = 1;
			}
			if (pmtx != NULL) {
				/*
				 * we are already holding pmtx and found a
				 * duplicate.  Don't keep our own pcp entry.
				 */
				keep = 0;
				continue;
			}
			/*
			 * We have to use mutex_tryenter to attempt to lock
			 * seg/amp list lock since we already hold hash lock
			 * and seg/amp list lock is above hash lock in lock
			 * order. If mutex_tryenter fails drop hash lock and
			 * retake both locks in correct order and re-search
			 * this hash chain.
			 */
			ASSERT(keep == 0);
			if (amp == NULL) {
				pheadp = &seg->s_phead;
				pmtx = &seg->s_pmtx;
			} else {
				pheadp = &amp->a_phead;
				pmtx = &amp->a_pmtx;
			}
			if (!mutex_tryenter(pmtx)) {
				mutex_exit(&hp->p_hmutex);
				mutex_enter(pmtx);
				mutex_enter(&hp->p_hmutex);
				/*
				 * If we don't find bigger shadow list on
				 * second search (it may happen since we
				 * dropped bucket lock) keep the entry that
				 * matches our own shadow list.
				 */
				keep = 1;
				goto again;
			}
		}
	}
	mutex_exit(&hp->p_hmutex);
	if (pmtx != NULL) {
		mutex_exit(pmtx);
	}
out:
	(*callback)(htag0, addr, len, pp, rw, 0);
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		seg_plocked -= npages;
		if (!IS_PFLAGS_WIRED(flags)) {
			ASSERT(seg_plocked_window >= npages);
			seg_plocked_window -= npages;
		}
		mutex_exit(&seg_pmem_mtx);
	}
}
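
/*
 * Lock ordering summarized from the comments in seg_pinactive() above and
 * seg_pinsert() below (illustrative summary only): the per seg/amp list lock
 * (s_pmtx/a_pmtx) is above the hash bucket lock (p_hmutex), which is in turn
 * above seg_pmem_mtx.  A thread that needs more than one of them takes them
 * in that order; code that already holds only p_hmutex must use
 * mutex_tryenter() on the list lock and retry, as seg_pinactive() does.
 */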

#ifdef DEBUG
static uint32_t p_insert_chk_mtbf = 0;
#endif

/*
 * The seg_pinsert_check() is used by segment drivers to predict whether
 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
 */
/*ARGSUSED*/
int
seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, uint_t flags)
{
	ASSERT(seg != NULL);

#ifdef DEBUG
	if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
		return (SEGP_FAIL);
	}
#endif

	if (seg_pdisabled) {
		return (SEGP_FAIL);
	}
	ASSERT(seg_phashsize_win != 0);

	if (IS_PFLAGS_WIRED(flags)) {
		return (SEGP_SUCCESS);
	}

	if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
		return (SEGP_FAIL);
	}

	if (freemem < desfree) {
		return (SEGP_FAIL);
	}

	return (SEGP_SUCCESS);
}
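
/*
 * Illustrative use of seg_pinsert_check() (hypothetical caller, not taken
 * from any particular driver): a pagelock routine can probe the cache limits
 * before doing the work of building a shadow list, e.g.
 *
 *	if (seg_pinsert_check(seg, amp, addr, len, flags) != SEGP_SUCCESS) {
 *		... fall back to the uncached pagelock path ...
 *	}
 */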

#ifdef DEBUG
static uint32_t p_insert_mtbf = 0;
#endif

/*
 * Insert address range with shadow list into pagelock cache if there's no
 * shadow list already cached for this address range. If the cache is off or
 * caching is temporarily disabled or the allowed 'window' is exceeded return
 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
 *
 * For non wired shadow lists (segvn case) include address in the hashing
 * function to avoid linking all the entries from the same segment or amp on
 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
 * pcache entries are also linked on a per segment/amp list so that all
 * entries can be found quickly during seg/amp purge without walking the
 * entire pcache hash table. For wired shadow lists (segspt case) we
 * don't use address hashing and per segment linking because the caller
 * currently inserts only one entry per segment that covers the entire
 * segment. If we used per segment linking even for segspt it would complicate
 * seg_ppurge_wiredpp() locking.
 *
 * Both hash bucket and per seg/amp locks need to be held before adding a non
 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
 * first.
 *
 * This function will also remove from pcache old inactive shadow lists that
 * overlap with this request but cover smaller range for the same start
 * address.
 */
int
seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
    size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
    seg_preclaim_cbfunc_t callback)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages;
	pcache_link_t *pheadp;
	kmutex_t *pmtx;
	struct seg_pcache *delcallb_list = NULL;

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);
	ASSERT(rw == S_READ || wlen == len);
	ASSERT(rw == S_WRITE || wlen <= len);
	ASSERT(amp == NULL || wlen == len);

#ifdef DEBUG
	if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
		return (SEGP_FAIL);
	}
#endif

	if (seg_pdisabled) {
		return (SEGP_FAIL);
	}
	ASSERT(seg_phashsize_win != 0);

	ASSERT((len & PAGEOFFSET) == 0);
	npages = btop(len);
	mutex_enter(&seg_pmem_mtx);
	if (!IS_PFLAGS_WIRED(flags)) {
		if (seg_plocked_window + npages > seg_pmaxwindow) {
			mutex_exit(&seg_pmem_mtx);
			return (SEGP_FAIL);
		}
		seg_plocked_window += npages;
	}
	seg_plocked += npages;
	mutex_exit(&seg_pmem_mtx);

	pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
	/*
	 * If amp is not NULL set htag0 to amp otherwise set it to seg.
	 */
	if (amp == NULL) {
		pcp->p_htag0 = (void *)seg;
		pcp->p_flags = flags & 0xffff;
	} else {
		pcp->p_htag0 = (void *)amp;
		pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
	}
	pcp->p_addr = addr;
	pcp->p_len = len;
	pcp->p_wlen = wlen;
	pcp->p_pp = pp;
	pcp->p_write = (rw == S_WRITE);
	pcp->p_callback = callback;
	pcp->p_active = 1;

	hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
	if (!IS_PFLAGS_WIRED(flags)) {
		int found;
		void *htag0;
		if (amp == NULL) {
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
			htag0 = (void *)seg;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
			htag0 = (void *)amp;
		}
		mutex_enter(pmtx);
		mutex_enter(&hp->p_hmutex);
		delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
		    len, &found);
		if (found) {
			mutex_exit(&hp->p_hmutex);
			mutex_exit(pmtx);
			mutex_enter(&seg_pmem_mtx);
			seg_plocked -= npages;
			seg_plocked_window -= npages;
			mutex_exit(&seg_pmem_mtx);
			kmem_cache_free(seg_pkmcache, pcp);
			goto out;
		}
		pcp->p_plink.p_lnext = pheadp->p_lnext;
		pcp->p_plink.p_lprev = pheadp;
		pheadp->p_lnext->p_lprev = &pcp->p_plink;
		pheadp->p_lnext = &pcp->p_plink;
	} else {
		mutex_enter(&hp->p_hmutex);
	}
	pcp->p_hashp = hp;
	pcp->p_hnext = hp->p_hnext;
	pcp->p_hprev = (struct seg_pcache *)hp;
	hp->p_hnext->p_hprev = pcp;
	hp->p_hnext = pcp;
	if (!IS_PFLAGS_WIRED(flags) &&
	    hp->p_hprev == pcp) {
		seg_padd_abuck(hp);
	}
	mutex_exit(&hp->p_hmutex);
	if (!IS_PFLAGS_WIRED(flags)) {
		mutex_exit(pmtx);
	}

out:
	npages = 0;
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp,
		    pcp->p_write ? S_WRITE : S_READ, 0);
		npages += btop(pcp->p_len);
		kmem_cache_free(seg_pkmcache, pcp);
	}
	if (npages) {
		ASSERT(!IS_PFLAGS_WIRED(flags));
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		ASSERT(seg_plocked_window >= npages);
		seg_plocked -= npages;
		seg_plocked_window -= npages;
		mutex_exit(&seg_pmem_mtx);
	}

	return (SEGP_SUCCESS);
}
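
/*
 * A note on p_wlen (restating the ASSERTs above and the check in
 * seg_plookup()): wlen is the number of bytes starting at addr for which
 * write access is allowed.  An S_WRITE insert requires wlen == len; an
 * S_READ insert may cache a smaller wlen.  A later S_WRITE lookup asking for
 * more than p_wlen bytes fails the lookup, so the caller rechecks
 * protections and fails that pagelock request itself instead of using the
 * cached shadow list.
 */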

/*
 * purge entries from the pagelock cache if not active
 * and not recently used.
 */
static void
seg_ppurge_async(int force)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;
	pgcnt_t	npgs_to_purge;
	pgcnt_t npgs_purged = 0;
	int hlinks = 0;
	int hlix;
	pcache_link_t *hlinkp;
	pcache_link_t *hlnextp = NULL;
	int lowmem;
	int trim;

	ASSERT(seg_phashsize_win != 0);

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
		return;
	}
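
	/*
	 * Reading the low memory tests below (the constants are fixed-point
	 * fractions): fmem <= 5 * (desfree >> 2) is roughly
	 * fmem <= 1.25 * desfree, fmem <= 7 * (lotsfree >> 3) is roughly
	 * fmem <= 0.875 * lotsfree, and the trim test
	 * seg_plocked_window >= 7 * (seg_pmaxwindow >> 3) fires once the
	 * window is roughly 7/8 full.
	 */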
	if (!force) {
		lowmem = 0;
		trim = 0;
		if (freemem < lotsfree + needfree) {
			spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
			if (fmem <= 5 * (desfree >> 2)) {
				lowmem = 1;
			} else if (fmem <= 7 * (lotsfree >> 3)) {
				if (seg_plocked_window >=
				    (availrmem_initial >> 1)) {
					lowmem = 1;
				}
			} else if (fmem < lotsfree) {
				if (seg_plocked_window >=
				    3 * (availrmem_initial >> 2)) {
					lowmem = 1;
				}
			}
		}
		if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
			trim = 1;
		}
		if (!lowmem && !trim) {
			return;
		}
		npgs_to_purge = seg_plocked_window >>
		    seg_pshrink_shift;
		if (lowmem) {
			npgs_to_purge = MIN(npgs_to_purge,
			    MAX(seg_pmaxapurge_npages, desfree));
		} else {
			npgs_to_purge = MIN(npgs_to_purge,
			    seg_pmaxapurge_npages);
		}
		if (npgs_to_purge == 0) {
			return;
		}
	} else {
		struct seg_phash_wired *hpw;

		ASSERT(seg_phashsize_wired != 0);

		for (hpw = seg_phashtab_wired;
		    hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {

			if (hpw->p_hnext == (struct seg_pcache *)hpw) {
				continue;
			}

			mutex_enter(&hpw->p_hmutex);

			for (pcp = hpw->p_hnext;
			    pcp != (struct seg_pcache *)hpw;
			    pcp = pcp->p_hnext) {

				ASSERT(IS_PCP_WIRED(pcp));
				ASSERT(pcp->p_hashp ==
				    (struct seg_phash *)hpw);

				if (pcp->p_active) {
					continue;
				}
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			mutex_exit(&hpw->p_hmutex);
		}
	}

	mutex_enter(&seg_pmem_mtx);
	if (seg_pathr_on) {
		mutex_exit(&seg_pmem_mtx);
		goto runcb;
	}
	seg_pathr_on = 1;
	mutex_exit(&seg_pmem_mtx);
	ASSERT(seg_pahcur <= 1);
	hlix = !seg_pahcur;

again:
	for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
	    hlinkp = hlnextp) {

		hlnextp = hlinkp->p_lnext;
		ASSERT(hlnextp != NULL);

		hp = hlink2phash(hlinkp, hlix);
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			seg_pathr_empty_ahb++;
			continue;
		}
		seg_pathr_full_ahb++;
		mutex_enter(&hp->p_hmutex);

		for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
		    pcp = pcp->p_hnext) {
			pcache_link_t *pheadp;
			pcache_link_t *plinkp;
			void *htag0;
			kmutex_t *pmtx;

			ASSERT(!IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_hashp == hp);

			if (pcp->p_active) {
				continue;
			}
			if (!force && pcp->p_ref &&
			    PCP_AGE(pcp) < seg_pmax_pcpage) {
				pcp->p_ref = 0;
				continue;
			}
			plinkp = &pcp->p_plink;
			htag0 = pcp->p_htag0;
			if (pcp->p_flags & SEGP_AMP) {
				pheadp = &((amp_t *)htag0)->a_phead;
				pmtx = &((amp_t *)htag0)->a_pmtx;
			} else {
				pheadp = &((seg_t *)htag0)->s_phead;
				pmtx = &((seg_t *)htag0)->s_pmtx;
			}
			if (!mutex_tryenter(pmtx)) {
				continue;
			}
			ASSERT(pheadp->p_lnext != pheadp);
			ASSERT(pheadp->p_lprev != pheadp);
			plinkp->p_lprev->p_lnext =
			    plinkp->p_lnext;
			plinkp->p_lnext->p_lprev =
			    plinkp->p_lprev;
			pcp->p_hprev->p_hnext = pcp->p_hnext;
			pcp->p_hnext->p_hprev = pcp->p_hprev;
			mutex_exit(pmtx);
			pcp->p_hprev = delcallb_list;
			delcallb_list = pcp;
			npgs_purged += btop(pcp->p_len);
		}
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			seg_premove_abuck(hp, 1);
		}
		mutex_exit(&hp->p_hmutex);
		if (npgs_purged >= seg_plocked_window) {
			break;
		}
		if (!force) {
			if (npgs_purged >= npgs_to_purge) {
				break;
			}
			if (!trim && !(seg_pathr_full_ahb & 15)) {
				ASSERT(lowmem);
				if (freemem >= lotsfree + needfree) {
					break;
				}
			}
		}
	}

	if (hlinkp == &seg_pahhead[hlix]) {
		/*
		 * We processed the entire hlix active bucket list
		 * but didn't find enough pages to reclaim.
		 * Switch the lists and walk the other list
		 * if we haven't done it yet.
		 */
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur == !hlix);
		seg_pahcur = hlix;
		mutex_exit(&seg_pmem_mtx);
		if (++hlinks < 2) {
			hlix = !hlix;
			goto again;
		}
	} else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
	    seg_pahhead[hlix].p_lnext != hlinkp) {
		ASSERT(hlinkp != NULL);
		ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);

		/*
		 * Reinsert the header to point to hlinkp
		 * so that we start from hlinkp bucket next time around.
		 */
		seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
		seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
		seg_pahhead[hlix].p_lnext = hlinkp;
		seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
		hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
		hlinkp->p_lprev = &seg_pahhead[hlix];
	}

	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_pathr_on);
	seg_pathr_on = 0;
	mutex_exit(&seg_pmem_mtx);

runcb:
	/*
	 * Run the delayed callback list. segments/amps can't go away until
	 * callback is executed since they must have non 0 softlockcnt. That's
	 * why we don't need to hold as/seg/amp locks to execute the callback.
	 */
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp,
		    pcp->p_write ? S_WRITE : S_READ, 1);
		npages += btop(pcp->p_len);
		if (!IS_PCP_WIRED(pcp)) {
			npages_window += btop(pcp->p_len);
		}
		kmem_cache_free(seg_pkmcache, pcp);
	}
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		ASSERT(seg_plocked_window >= npages_window);
		seg_plocked -= npages;
		seg_plocked_window -= npages_window;
		mutex_exit(&seg_pmem_mtx);
	}
}

/*
 * Remove cached pages for segment(s) entries from hashtable. The segments
 * are identified by pp array. This is useful for multiple seg's cached on
 * behalf of dummy segment (ISM/DISM) with common pp array.
 */
void
seg_ppurge_wiredpp(struct page **pp)
{
	struct seg_pcache *pcp;
	struct seg_phash_wired *hp;
	pgcnt_t npages = 0;
	struct seg_pcache *delcallb_list = NULL;

	/*
	 * if the cache is empty, return
	 */
	if (seg_plocked == 0) {
		return;
	}
	ASSERT(seg_phashsize_wired != 0);

	for (hp = seg_phashtab_wired;
	    hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			continue;
		}
		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;
		while (pcp != (struct seg_pcache *)hp) {
			ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
			ASSERT(IS_PCP_WIRED(pcp));
			/*
			 * purge entries which are not active
			 */
			if (!pcp->p_active && pcp->p_pp == pp) {
				ASSERT(pcp->p_htag0 != NULL);
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			pcp = pcp->p_hnext;
		}
		mutex_exit(&hp->p_hmutex);
		/*
		 * segments can't go away until callback is executed since
		 * they must have non 0 softlockcnt. That's why we don't
		 * need to hold as/seg locks to execute the callback.
		 */
		while (delcallb_list != NULL) {
			int done;
			pcp = delcallb_list;
			delcallb_list = pcp->p_hprev;
			ASSERT(!pcp->p_active);
			done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
			    pcp->p_len, pcp->p_pp,
			    pcp->p_write ? S_WRITE : S_READ, 1);
			npages += btop(pcp->p_len);
			ASSERT(IS_PCP_WIRED(pcp));
			kmem_cache_free(seg_pkmcache, pcp);
			if (done) {
				ASSERT(delcallb_list == NULL);
				goto out;
			}
		}
	}

out:
	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_plocked >= npages);
	seg_plocked -= npages;
	mutex_exit(&seg_pmem_mtx);
}
S_WRITE : S_READ, 1);
1247*6695Saguzovsk npages += btop(pcp->p_len);
1248*6695Saguzovsk ASSERT(IS_PCP_WIRED(pcp));
1249*6695Saguzovsk kmem_cache_free(seg_pkmcache, pcp);
1250*6695Saguzovsk if (done) {
1251*6695Saguzovsk ASSERT(delcallb_list == NULL);
1252*6695Saguzovsk goto out;
1253*6695Saguzovsk }
1254*6695Saguzovsk }
12550Sstevel@tonic-gate }
12560Sstevel@tonic-gate
1257*6695Saguzovsk out:
1258*6695Saguzovsk mutex_enter(&seg_pmem_mtx);
1259*6695Saguzovsk ASSERT(seg_plocked >= npages);
12600Sstevel@tonic-gate seg_plocked -= npages;
1261*6695Saguzovsk mutex_exit(&seg_pmem_mtx);
12620Sstevel@tonic-gate }
12630Sstevel@tonic-gate
12640Sstevel@tonic-gate /*
12650Sstevel@tonic-gate * purge all entries for a given segment. Since we
12660Sstevel@tonic-gate * call back into the segment driver directly for page
12670Sstevel@tonic-gate * reclaim, the caller needs to hold the right locks.
12680Sstevel@tonic-gate */
12690Sstevel@tonic-gate void
1270*6695Saguzovsk seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
12710Sstevel@tonic-gate {
12720Sstevel@tonic-gate struct seg_pcache *delcallb_list = NULL;
12730Sstevel@tonic-gate struct seg_pcache *pcp;
12740Sstevel@tonic-gate struct seg_phash *hp;
12750Sstevel@tonic-gate pgcnt_t npages = 0;
1276*6695Saguzovsk void *htag0;
12770Sstevel@tonic-gate
1278*6695Saguzovsk if (seg_plocked == 0) {
12790Sstevel@tonic-gate return;
12800Sstevel@tonic-gate }
1281*6695Saguzovsk ASSERT(seg_phashsize_win != 0);
1282*6695Saguzovsk
1283*6695Saguzovsk /*
1284*6695Saguzovsk * If amp is not NULL, use amp as a lookup tag; otherwise use seg
1285*6695Saguzovsk * as a lookup tag.
1286*6695Saguzovsk */
1287*6695Saguzovsk htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1288*6695Saguzovsk ASSERT(htag0 != NULL);
1289*6695Saguzovsk if (IS_PFLAGS_WIRED(flags)) {
1290*6695Saguzovsk hp = P_HASHBP(seg, htag0, 0, flags);
1291*6695Saguzovsk mutex_enter(&hp->p_hmutex);
1292*6695Saguzovsk pcp = hp->p_hnext;
1293*6695Saguzovsk while (pcp != (struct seg_pcache *)hp) {
1294*6695Saguzovsk ASSERT(pcp->p_hashp == hp);
1295*6695Saguzovsk ASSERT(IS_PCP_WIRED(pcp));
1296*6695Saguzovsk if (pcp->p_htag0 == htag0) {
1297*6695Saguzovsk if (pcp->p_active) {
1298*6695Saguzovsk break;
1299*6695Saguzovsk }
1300*6695Saguzovsk pcp->p_hprev->p_hnext = pcp->p_hnext;
1301*6695Saguzovsk pcp->p_hnext->p_hprev = pcp->p_hprev;
1302*6695Saguzovsk pcp->p_hprev = delcallb_list;
1303*6695Saguzovsk delcallb_list = pcp;
1304*6695Saguzovsk }
1305*6695Saguzovsk pcp = pcp->p_hnext;
1306*6695Saguzovsk }
1307*6695Saguzovsk mutex_exit(&hp->p_hmutex);
1308*6695Saguzovsk } else {
1309*6695Saguzovsk pcache_link_t *plinkp;
1310*6695Saguzovsk pcache_link_t *pheadp;
1311*6695Saguzovsk kmutex_t *pmtx;
1312*6695Saguzovsk
1313*6695Saguzovsk if (amp == NULL) {
1314*6695Saguzovsk ASSERT(seg != NULL);
1315*6695Saguzovsk pheadp = &seg->s_phead;
1316*6695Saguzovsk pmtx = &seg->s_pmtx;
1317*6695Saguzovsk } else {
1318*6695Saguzovsk pheadp = &amp->a_phead;
1319*6695Saguzovsk pmtx = &amp->a_pmtx;
1320*6695Saguzovsk }
1321*6695Saguzovsk mutex_enter(pmtx);
1322*6695Saguzovsk while ((plinkp = pheadp->p_lnext) != pheadp) {
1323*6695Saguzovsk pcp = plink2pcache(plinkp);
1324*6695Saguzovsk ASSERT(!IS_PCP_WIRED(pcp));
1325*6695Saguzovsk ASSERT(pcp->p_htag0 == htag0);
1326*6695Saguzovsk hp = pcp->p_hashp;
1327*6695Saguzovsk mutex_enter(&hp->p_hmutex);
13280Sstevel@tonic-gate if (pcp->p_active) {
1329*6695Saguzovsk mutex_exit(&hp->p_hmutex);
13300Sstevel@tonic-gate break;
13310Sstevel@tonic-gate }
1332*6695Saguzovsk ASSERT(plinkp->p_lprev ==
pheadp);
1333*6695Saguzovsk pheadp->p_lnext = plinkp->p_lnext;
1334*6695Saguzovsk plinkp->p_lnext->p_lprev = pheadp;
13350Sstevel@tonic-gate pcp->p_hprev->p_hnext = pcp->p_hnext;
13360Sstevel@tonic-gate pcp->p_hnext->p_hprev = pcp->p_hprev;
13370Sstevel@tonic-gate pcp->p_hprev = delcallb_list;
13380Sstevel@tonic-gate delcallb_list = pcp;
1339*6695Saguzovsk if (hp->p_hnext == (struct seg_pcache *)hp) {
1340*6695Saguzovsk seg_premove_abuck(hp, 0);
1341*6695Saguzovsk }
1342*6695Saguzovsk mutex_exit(&hp->p_hmutex);
13430Sstevel@tonic-gate }
1344*6695Saguzovsk mutex_exit(pmtx);
13450Sstevel@tonic-gate }
13460Sstevel@tonic-gate while (delcallb_list != NULL) {
13470Sstevel@tonic-gate pcp = delcallb_list;
13480Sstevel@tonic-gate delcallb_list = pcp->p_hprev;
1349*6695Saguzovsk ASSERT(!pcp->p_active);
1350*6695Saguzovsk (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1351*6695Saguzovsk pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1352*6695Saguzovsk npages += btop(pcp->p_len);
1353*6695Saguzovsk kmem_cache_free(seg_pkmcache, pcp);
13540Sstevel@tonic-gate }
1355*6695Saguzovsk mutex_enter(&seg_pmem_mtx);
1356*6695Saguzovsk ASSERT(seg_plocked >= npages);
13570Sstevel@tonic-gate seg_plocked -= npages;
1358*6695Saguzovsk if (!IS_PFLAGS_WIRED(flags)) {
1359*6695Saguzovsk ASSERT(seg_plocked_window >= npages);
1360*6695Saguzovsk seg_plocked_window -= npages;
1361*6695Saguzovsk }
1362*6695Saguzovsk mutex_exit(&seg_pmem_mtx);
13630Sstevel@tonic-gate }
13640Sstevel@tonic-gate
13650Sstevel@tonic-gate static void seg_pinit_mem_config(void);
13660Sstevel@tonic-gate
13670Sstevel@tonic-gate /*
13680Sstevel@tonic-gate * set up the pagelock cache
13690Sstevel@tonic-gate */
13700Sstevel@tonic-gate static void
13710Sstevel@tonic-gate seg_pinit(void)
13720Sstevel@tonic-gate {
13730Sstevel@tonic-gate struct seg_phash *hp;
1374*6695Saguzovsk ulong_t i;
1375*6695Saguzovsk pgcnt_t physmegs;
1376*6695Saguzovsk
1377*6695Saguzovsk seg_plocked = 0;
1378*6695Saguzovsk seg_plocked_window = 0;
1379*6695Saguzovsk
1380*6695Saguzovsk if (segpcache_enabled == 0) {
1381*6695Saguzovsk seg_phashsize_win = 0;
1382*6695Saguzovsk seg_phashsize_wired = 0;
1383*6695Saguzovsk seg_pdisabled = 1;
1384*6695Saguzovsk return;
1385*6695Saguzovsk }
13860Sstevel@tonic-gate
1387*6695Saguzovsk seg_pdisabled = 0;
1388*6695Saguzovsk seg_pkmcache = kmem_cache_create("seg_pcache",
1389*6695Saguzovsk sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1390*6695Saguzovsk if (segpcache_pcp_maxage_ticks <= 0) {
1391*6695Saguzovsk segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1392*6695Saguzovsk }
1393*6695Saguzovsk seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1394*6695Saguzovsk seg_pathr_empty_ahb = 0;
1395*6695Saguzovsk seg_pathr_full_ahb = 0;
1396*6695Saguzovsk seg_pshrink_shift = segpcache_shrink_shift;
1397*6695Saguzovsk seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
13980Sstevel@tonic-gate
1399*6695Saguzovsk mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1400*6695Saguzovsk mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1401*6695Saguzovsk mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1402*6695Saguzovsk cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1403*6695Saguzovsk
1404*6695Saguzovsk physmegs = physmem >> (20 - PAGESHIFT);
14050Sstevel@tonic-gate
1406*6695Saguzovsk /*
1407*6695Saguzovsk * If segpcache_hashsize_win was not set in /etc/system or it has
1408*6695Saguzovsk * an absurd value, set it to a default.
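 * For example (illustrative arithmetic, not part of the original comment):
 * with 4K pages, btop(32 * 1024) below works out to 8 pages per bucket, so
 * a machine with 4GB of physical memory (about 1M pages) ends up with
 * roughly 128K (2^17) buckets, which the power-of-two rounding that
 * follows leaves unchanged.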
1409*6695Saguzovsk */
1410*6695Saguzovsk if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1411*6695Saguzovsk /*
1412*6695Saguzovsk * Create one bucket per 32K (or at least per 8 pages) of
1413*6695Saguzovsk * available memory.
1414*6695Saguzovsk */
1415*6695Saguzovsk pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1416*6695Saguzovsk segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1417*6695Saguzovsk }
1418*6695Saguzovsk if (!ISP2(segpcache_hashsize_win)) {
1419*6695Saguzovsk ulong_t rndfac = ~(1UL <<
1420*6695Saguzovsk (highbit(segpcache_hashsize_win) - 1));
1421*6695Saguzovsk rndfac &= segpcache_hashsize_win;
1422*6695Saguzovsk segpcache_hashsize_win += rndfac;
1423*6695Saguzovsk segpcache_hashsize_win = 1 <<
1424*6695Saguzovsk (highbit(segpcache_hashsize_win) - 1);
1425*6695Saguzovsk }
1426*6695Saguzovsk seg_phashsize_win = segpcache_hashsize_win;
1427*6695Saguzovsk seg_phashtab_win = kmem_zalloc(
1428*6695Saguzovsk seg_phashsize_win * sizeof (struct seg_phash),
1429*6695Saguzovsk KM_SLEEP);
1430*6695Saguzovsk for (i = 0; i < seg_phashsize_win; i++) {
1431*6695Saguzovsk hp = &seg_phashtab_win[i];
1432*6695Saguzovsk hp->p_hnext = (struct seg_pcache *)hp;
1433*6695Saguzovsk hp->p_hprev = (struct seg_pcache *)hp;
1434*6695Saguzovsk mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1435*6695Saguzovsk }
14360Sstevel@tonic-gate
1437*6695Saguzovsk seg_pahcur = 0;
1438*6695Saguzovsk seg_pathr_on = 0;
1439*6695Saguzovsk seg_pahhead[0].p_lnext = &seg_pahhead[0];
1440*6695Saguzovsk seg_pahhead[0].p_lprev = &seg_pahhead[0];
1441*6695Saguzovsk seg_pahhead[1].p_lnext = &seg_pahhead[1];
1442*6695Saguzovsk seg_pahhead[1].p_lprev = &seg_pahhead[1];
1443*6695Saguzovsk
1444*6695Saguzovsk /*
1445*6695Saguzovsk * If segpcache_hashsize_wired was not set in /etc/system or it has
1446*6695Saguzovsk * an absurd value, set it to a default.
1447*6695Saguzovsk */
1448*6695Saguzovsk if (segpcache_hashsize_wired == 0 ||
1449*6695Saguzovsk segpcache_hashsize_wired > physmem / 4) {
1450*6695Saguzovsk /*
1451*6695Saguzovsk * Choose segpcache_hashsize_wired based on physmem.
1452*6695Saguzovsk * Create a bucket per 128K bytes, up to 256K buckets.
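 * For example (illustrative): physmegs << 3 is 8 buckets per MB, i.e.
 * one bucket per 128K of physical memory, so a 1GB machine gets 8192
 * wired-list buckets, while systems with 20GB or more are simply given
 * the 256K-bucket maximum.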
1453*6695Saguzovsk */ 1454*6695Saguzovsk if (physmegs < 20 * 1024) { 1455*6695Saguzovsk segpcache_hashsize_wired = MAX(1024, physmegs << 3); 1456*6695Saguzovsk } else { 1457*6695Saguzovsk segpcache_hashsize_wired = 256 * 1024; 14580Sstevel@tonic-gate } 14590Sstevel@tonic-gate } 1460*6695Saguzovsk if (!ISP2(segpcache_hashsize_wired)) { 1461*6695Saguzovsk segpcache_hashsize_wired = 1 << 1462*6695Saguzovsk highbit(segpcache_hashsize_wired); 1463*6695Saguzovsk } 1464*6695Saguzovsk seg_phashsize_wired = segpcache_hashsize_wired; 1465*6695Saguzovsk seg_phashtab_wired = kmem_zalloc( 1466*6695Saguzovsk seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP); 1467*6695Saguzovsk for (i = 0; i < seg_phashsize_wired; i++) { 1468*6695Saguzovsk hp = (struct seg_phash *)&seg_phashtab_wired[i]; 1469*6695Saguzovsk hp->p_hnext = (struct seg_pcache *)hp; 1470*6695Saguzovsk hp->p_hprev = (struct seg_pcache *)hp; 1471*6695Saguzovsk mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); 1472*6695Saguzovsk } 14730Sstevel@tonic-gate 1474*6695Saguzovsk if (segpcache_maxwindow == 0) { 1475*6695Saguzovsk if (physmegs < 64) { 1476*6695Saguzovsk /* 3% of memory */ 1477*6695Saguzovsk segpcache_maxwindow = availrmem >> 5; 1478*6695Saguzovsk } else if (physmegs < 512) { 1479*6695Saguzovsk /* 12% of memory */ 1480*6695Saguzovsk segpcache_maxwindow = availrmem >> 3; 1481*6695Saguzovsk } else if (physmegs < 1024) { 1482*6695Saguzovsk /* 25% of memory */ 1483*6695Saguzovsk segpcache_maxwindow = availrmem >> 2; 1484*6695Saguzovsk } else if (physmegs < 2048) { 1485*6695Saguzovsk /* 50% of memory */ 1486*6695Saguzovsk segpcache_maxwindow = availrmem >> 1; 1487*6695Saguzovsk } else { 1488*6695Saguzovsk /* no limit */ 1489*6695Saguzovsk segpcache_maxwindow = (pgcnt_t)-1; 1490*6695Saguzovsk } 1491*6695Saguzovsk } 1492*6695Saguzovsk seg_pmaxwindow = segpcache_maxwindow; 14930Sstevel@tonic-gate seg_pinit_mem_config(); 14940Sstevel@tonic-gate } 14950Sstevel@tonic-gate 14960Sstevel@tonic-gate /* 14970Sstevel@tonic-gate * called by pageout if memory is low 14980Sstevel@tonic-gate */ 14990Sstevel@tonic-gate void 15000Sstevel@tonic-gate seg_preap(void) 15010Sstevel@tonic-gate { 15020Sstevel@tonic-gate /* 1503*6695Saguzovsk * if the cache is off or empty, return 15040Sstevel@tonic-gate */ 1505*6695Saguzovsk if (seg_plocked_window == 0) { 15060Sstevel@tonic-gate return; 15070Sstevel@tonic-gate } 1508*6695Saguzovsk ASSERT(seg_phashsize_win != 0); 1509*6695Saguzovsk 1510*6695Saguzovsk /* 1511*6695Saguzovsk * If somebody is already purging pcache 1512*6695Saguzovsk * just return. 
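 * (All this routine does is wake seg_pasync_thread() below via
 * seg_pasync_cv; the actual purge is performed there through
 * seg_ppurge_async().)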
1513*6695Saguzovsk */
1514*6695Saguzovsk if (seg_pdisabled) {
1515*6695Saguzovsk return;
1516*6695Saguzovsk }
1517*6695Saguzovsk
1518*6695Saguzovsk cv_signal(&seg_pasync_cv);
15190Sstevel@tonic-gate }
15200Sstevel@tonic-gate
15210Sstevel@tonic-gate /*
15220Sstevel@tonic-gate * run as a background thread and reclaim pagelock
15230Sstevel@tonic-gate * pages which have not been used recently
15240Sstevel@tonic-gate */
15250Sstevel@tonic-gate void
15260Sstevel@tonic-gate seg_pasync_thread(void)
15270Sstevel@tonic-gate {
15280Sstevel@tonic-gate callb_cpr_t cpr_info;
15290Sstevel@tonic-gate
1530*6695Saguzovsk if (seg_phashsize_win == 0) {
1531*6695Saguzovsk thread_exit();
1532*6695Saguzovsk /*NOTREACHED*/
15330Sstevel@tonic-gate }
15340Sstevel@tonic-gate
1535*6695Saguzovsk seg_pasync_thr = curthread;
1536*6695Saguzovsk
1537*6695Saguzovsk CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1538*6695Saguzovsk callb_generic_cpr, "seg_pasync");
1539*6695Saguzovsk
1540*6695Saguzovsk if (segpcache_reap_ticks <= 0) {
1541*6695Saguzovsk segpcache_reap_ticks = segpcache_reap_sec * hz;
1542*6695Saguzovsk }
15430Sstevel@tonic-gate
1544*6695Saguzovsk mutex_enter(&seg_pasync_mtx);
1545*6695Saguzovsk for (;;) {
1546*6695Saguzovsk CALLB_CPR_SAFE_BEGIN(&cpr_info);
1547*6695Saguzovsk (void) cv_timedwait(&seg_pasync_cv, &seg_pasync_mtx,
1548*6695Saguzovsk lbolt + segpcache_reap_ticks);
1549*6695Saguzovsk CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1550*6695Saguzovsk if (seg_pdisabled == 0) {
1551*6695Saguzovsk seg_ppurge_async(0);
1552*6695Saguzovsk }
15530Sstevel@tonic-gate }
15540Sstevel@tonic-gate }
15550Sstevel@tonic-gate
15560Sstevel@tonic-gate static struct kmem_cache *seg_cache;
15570Sstevel@tonic-gate
15580Sstevel@tonic-gate /*
15590Sstevel@tonic-gate * Initialize segment management data structures.
15600Sstevel@tonic-gate */
15610Sstevel@tonic-gate void
15620Sstevel@tonic-gate seg_init(void)
15630Sstevel@tonic-gate {
15640Sstevel@tonic-gate kstat_t *ksp;
15650Sstevel@tonic-gate
1566*6695Saguzovsk seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1567*6695Saguzovsk 0, NULL, NULL, NULL, NULL, NULL, 0);
15680Sstevel@tonic-gate
15690Sstevel@tonic-gate ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
15705928Sjj204856 segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
15710Sstevel@tonic-gate if (ksp) {
15720Sstevel@tonic-gate ksp->ks_data = (void *)segadvstat_ptr;
15730Sstevel@tonic-gate kstat_install(ksp);
15740Sstevel@tonic-gate }
15750Sstevel@tonic-gate
15760Sstevel@tonic-gate seg_pinit();
15770Sstevel@tonic-gate }
15780Sstevel@tonic-gate
15790Sstevel@tonic-gate /*
15800Sstevel@tonic-gate * Allocate a segment to cover [base, base+size]
15810Sstevel@tonic-gate * and attach it to the specified address space.
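 * A typical caller (illustrative sketch only; the segfoo_* names are
 * hypothetical and not part of this file) does something like:
 *
 *	seg = seg_alloc(as, addr, len);
 *	if (seg == NULL)
 *		return (ENOMEM);
 *	seg->s_ops = &segfoo_ops;
 *	seg->s_data = (void *)segfoo_data;
 *
 * i.e. the caller is responsible for filling in s_ops and s_data, as
 * noted at the end of this function.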
15820Sstevel@tonic-gate */
15830Sstevel@tonic-gate struct seg *
15840Sstevel@tonic-gate seg_alloc(struct as *as, caddr_t base, size_t size)
15850Sstevel@tonic-gate {
15860Sstevel@tonic-gate struct seg *new;
15870Sstevel@tonic-gate caddr_t segbase;
15880Sstevel@tonic-gate size_t segsize;
15890Sstevel@tonic-gate
15900Sstevel@tonic-gate segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
15910Sstevel@tonic-gate segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
15920Sstevel@tonic-gate (uintptr_t)segbase;
15930Sstevel@tonic-gate
15940Sstevel@tonic-gate if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
15950Sstevel@tonic-gate return ((struct seg *)NULL); /* bad virtual addr range */
15960Sstevel@tonic-gate
15970Sstevel@tonic-gate if (as != &kas &&
15980Sstevel@tonic-gate valid_usr_range(segbase, segsize, 0, as,
15990Sstevel@tonic-gate as->a_userlimit) != RANGE_OKAY)
16000Sstevel@tonic-gate return ((struct seg *)NULL); /* bad virtual addr range */
16010Sstevel@tonic-gate
16020Sstevel@tonic-gate new = kmem_cache_alloc(seg_cache, KM_SLEEP);
16030Sstevel@tonic-gate new->s_ops = NULL;
16040Sstevel@tonic-gate new->s_data = NULL;
16050Sstevel@tonic-gate new->s_szc = 0;
16060Sstevel@tonic-gate new->s_flags = 0;
1607*6695Saguzovsk mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1608*6695Saguzovsk new->s_phead.p_lnext = &new->s_phead;
1609*6695Saguzovsk new->s_phead.p_lprev = &new->s_phead;
16100Sstevel@tonic-gate if (seg_attach(as, segbase, segsize, new) < 0) {
16110Sstevel@tonic-gate kmem_cache_free(seg_cache, new);
16120Sstevel@tonic-gate return ((struct seg *)NULL);
16130Sstevel@tonic-gate }
16140Sstevel@tonic-gate /* caller must fill in ops, data */
16150Sstevel@tonic-gate return (new);
16160Sstevel@tonic-gate }
16170Sstevel@tonic-gate
16180Sstevel@tonic-gate /*
16190Sstevel@tonic-gate * Attach a segment to the address space. Used by seg_alloc()
16200Sstevel@tonic-gate * and for kernel startup to attach to static segments.
16210Sstevel@tonic-gate */
16220Sstevel@tonic-gate int
16230Sstevel@tonic-gate seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
16240Sstevel@tonic-gate {
16250Sstevel@tonic-gate seg->s_as = as;
16260Sstevel@tonic-gate seg->s_base = base;
16270Sstevel@tonic-gate seg->s_size = size;
16280Sstevel@tonic-gate
16290Sstevel@tonic-gate /*
16300Sstevel@tonic-gate * as_addseg() will add the segment at the appropriate point
16310Sstevel@tonic-gate * in the list. It will return -1 if there is overlap with
16320Sstevel@tonic-gate * an already existing segment.
16330Sstevel@tonic-gate */
16340Sstevel@tonic-gate return (as_addseg(as, seg));
16350Sstevel@tonic-gate }
16360Sstevel@tonic-gate
16370Sstevel@tonic-gate /*
16380Sstevel@tonic-gate * Unmap a segment and free it from its associated address space.
16390Sstevel@tonic-gate * This should be called by anybody who's finished with a whole segment's
16400Sstevel@tonic-gate * mapping. Just calls SEGOP_UNMAP() on the whole mapping. It is the
16410Sstevel@tonic-gate * responsibility of the segment driver to unlink the segment
16420Sstevel@tonic-gate * from the address space, and to free public and private data structures
16430Sstevel@tonic-gate * associated with the segment. (This is typically done by a call to
16440Sstevel@tonic-gate * seg_free()).
16450Sstevel@tonic-gate */ 16460Sstevel@tonic-gate void 16470Sstevel@tonic-gate seg_unmap(struct seg *seg) 16480Sstevel@tonic-gate { 16490Sstevel@tonic-gate #ifdef DEBUG 16500Sstevel@tonic-gate int ret; 16510Sstevel@tonic-gate #endif /* DEBUG */ 16520Sstevel@tonic-gate 16530Sstevel@tonic-gate ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 16540Sstevel@tonic-gate 16550Sstevel@tonic-gate /* Shouldn't have called seg_unmap if mapping isn't yet established */ 16560Sstevel@tonic-gate ASSERT(seg->s_data != NULL); 16570Sstevel@tonic-gate 16580Sstevel@tonic-gate /* Unmap the whole mapping */ 16590Sstevel@tonic-gate #ifdef DEBUG 16600Sstevel@tonic-gate ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 16610Sstevel@tonic-gate ASSERT(ret == 0); 16620Sstevel@tonic-gate #else 16630Sstevel@tonic-gate SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 16640Sstevel@tonic-gate #endif /* DEBUG */ 16650Sstevel@tonic-gate } 16660Sstevel@tonic-gate 16670Sstevel@tonic-gate /* 16680Sstevel@tonic-gate * Free the segment from its associated as. This should only be called 16690Sstevel@tonic-gate * if a mapping to the segment has not yet been established (e.g., if 16700Sstevel@tonic-gate * an error occurs in the middle of doing an as_map when the segment 16710Sstevel@tonic-gate * has already been partially set up) or if it has already been deleted 16720Sstevel@tonic-gate * (e.g., from a segment driver unmap routine if the unmap applies to the 16730Sstevel@tonic-gate * entire segment). If the mapping is currently set up then seg_unmap() should 16740Sstevel@tonic-gate * be called instead. 16750Sstevel@tonic-gate */ 16760Sstevel@tonic-gate void 16770Sstevel@tonic-gate seg_free(struct seg *seg) 16780Sstevel@tonic-gate { 16790Sstevel@tonic-gate register struct as *as = seg->s_as; 16800Sstevel@tonic-gate struct seg *tseg = as_removeseg(as, seg); 16810Sstevel@tonic-gate 16820Sstevel@tonic-gate ASSERT(tseg == seg); 16830Sstevel@tonic-gate 16840Sstevel@tonic-gate /* 16850Sstevel@tonic-gate * If the segment private data field is NULL, 16860Sstevel@tonic-gate * then segment driver is not attached yet. 16870Sstevel@tonic-gate */ 16880Sstevel@tonic-gate if (seg->s_data != NULL) 16890Sstevel@tonic-gate SEGOP_FREE(seg); 16900Sstevel@tonic-gate 1691*6695Saguzovsk mutex_destroy(&seg->s_pmtx); 1692*6695Saguzovsk ASSERT(seg->s_phead.p_lnext == &seg->s_phead); 1693*6695Saguzovsk ASSERT(seg->s_phead.p_lprev == &seg->s_phead); 16940Sstevel@tonic-gate kmem_cache_free(seg_cache, seg); 16950Sstevel@tonic-gate } 16960Sstevel@tonic-gate 16970Sstevel@tonic-gate /*ARGSUSED*/ 16980Sstevel@tonic-gate static void 16990Sstevel@tonic-gate seg_p_mem_config_post_add( 17000Sstevel@tonic-gate void *arg, 17010Sstevel@tonic-gate pgcnt_t delta_pages) 17020Sstevel@tonic-gate { 17030Sstevel@tonic-gate /* Nothing to do. */ 17040Sstevel@tonic-gate } 17050Sstevel@tonic-gate 17063480Sjfrank void 17073480Sjfrank seg_p_enable(void) 17083480Sjfrank { 1709*6695Saguzovsk mutex_enter(&seg_pcache_mtx); 1710*6695Saguzovsk ASSERT(seg_pdisabled != 0); 1711*6695Saguzovsk seg_pdisabled--; 1712*6695Saguzovsk mutex_exit(&seg_pcache_mtx); 17133480Sjfrank } 17143480Sjfrank 17153480Sjfrank /* 17163480Sjfrank * seg_p_disable - disables seg_pcache, and then attempts to empty the 17173480Sjfrank * cache. 17183480Sjfrank * Returns SEGP_SUCCESS if the cache was successfully emptied, or 17193480Sjfrank * SEGP_FAIL if the cache could not be emptied. 
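 * Note that seg_pdisabled is bumped before the purge is attempted and is
 * not undone here on failure, so callers are expected to follow up with
 * seg_p_enable() in either case (as the memory DR pre-delete/post-delete
 * callbacks below do).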
17203480Sjfrank */
17213480Sjfrank int
17223480Sjfrank seg_p_disable(void)
17233480Sjfrank {
17243480Sjfrank pgcnt_t old_plocked;
17253480Sjfrank int stall_count = 0;
17263480Sjfrank
1727*6695Saguzovsk mutex_enter(&seg_pcache_mtx);
1728*6695Saguzovsk seg_pdisabled++;
1729*6695Saguzovsk ASSERT(seg_pdisabled != 0);
1730*6695Saguzovsk mutex_exit(&seg_pcache_mtx);
17313480Sjfrank
17323480Sjfrank /*
17333480Sjfrank * Attempt to empty the cache. Terminate if seg_plocked does not
17343480Sjfrank * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
17353480Sjfrank */
17363480Sjfrank while (seg_plocked != 0) {
1737*6695Saguzovsk ASSERT(seg_phashsize_win != 0);
17383480Sjfrank old_plocked = seg_plocked;
1739*6695Saguzovsk seg_ppurge_async(1);
17403480Sjfrank if (seg_plocked == old_plocked) {
17413480Sjfrank if (stall_count++ > SEGP_STALL_THRESHOLD) {
17423480Sjfrank return (SEGP_FAIL);
17433480Sjfrank }
17443480Sjfrank } else
17453480Sjfrank stall_count = 0;
17463480Sjfrank if (seg_plocked != 0)
17473480Sjfrank delay(hz/SEGP_PREDEL_DELAY_FACTOR);
17483480Sjfrank }
17493480Sjfrank return (SEGP_SUCCESS);
17503480Sjfrank }
17513480Sjfrank
17520Sstevel@tonic-gate /*
17530Sstevel@tonic-gate * Attempt to purge seg_pcache. May need to return before this has
17540Sstevel@tonic-gate * completed to allow other pre_del callbacks to unlock pages. This is
17550Sstevel@tonic-gate * ok because:
1756*6695Saguzovsk * 1) The seg_pdisabled flag has been set so at least we won't
17570Sstevel@tonic-gate * cache any more locks, and the locks we couldn't purge
17580Sstevel@tonic-gate * will not be held if they do get released by a subsequent
17590Sstevel@tonic-gate * pre-delete callback.
17600Sstevel@tonic-gate *
17610Sstevel@tonic-gate * 2) The rest of the memory delete thread processing does not
17620Sstevel@tonic-gate * depend on the changes made in this pre-delete callback. No
17630Sstevel@tonic-gate * panics will result; the worst that will happen is that the
17640Sstevel@tonic-gate * DR code will time out and cancel the delete.
17650Sstevel@tonic-gate */
17660Sstevel@tonic-gate /*ARGSUSED*/
17670Sstevel@tonic-gate static int
17680Sstevel@tonic-gate seg_p_mem_config_pre_del(
17690Sstevel@tonic-gate void *arg,
17700Sstevel@tonic-gate pgcnt_t delta_pages)
17710Sstevel@tonic-gate {
1772*6695Saguzovsk if (seg_phashsize_win == 0) {
1773*6695Saguzovsk return (0);
1774*6695Saguzovsk }
17753480Sjfrank if (seg_p_disable() != SEGP_SUCCESS)
17763480Sjfrank cmn_err(CE_NOTE,
17773480Sjfrank "!Pre-delete couldn't purge"" pagelock cache - continuing");
17780Sstevel@tonic-gate return (0);
17790Sstevel@tonic-gate }
17800Sstevel@tonic-gate
17810Sstevel@tonic-gate /*ARGSUSED*/
17820Sstevel@tonic-gate static void
17830Sstevel@tonic-gate seg_p_mem_config_post_del(
17840Sstevel@tonic-gate void *arg,
17850Sstevel@tonic-gate pgcnt_t delta_pages,
17860Sstevel@tonic-gate int cancelled)
17870Sstevel@tonic-gate {
1788*6695Saguzovsk if (seg_phashsize_win == 0) {
1789*6695Saguzovsk return;
1790*6695Saguzovsk }
17913480Sjfrank seg_p_enable();
17920Sstevel@tonic-gate }
17930Sstevel@tonic-gate
17940Sstevel@tonic-gate static kphysm_setup_vector_t seg_p_mem_config_vec = {
17950Sstevel@tonic-gate KPHYSM_SETUP_VECTOR_VERSION,
17960Sstevel@tonic-gate seg_p_mem_config_post_add,
17970Sstevel@tonic-gate seg_p_mem_config_pre_del,
17980Sstevel@tonic-gate seg_p_mem_config_post_del,
17990Sstevel@tonic-gate };
18000Sstevel@tonic-gate
18010Sstevel@tonic-gate static void
18020Sstevel@tonic-gate seg_pinit_mem_config(void)
18030Sstevel@tonic-gate {
18040Sstevel@tonic-gate int ret;
18050Sstevel@tonic-gate
18060Sstevel@tonic-gate ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
18070Sstevel@tonic-gate /*
18080Sstevel@tonic-gate * Want to catch this in the debug kernel. At run time, if the
18090Sstevel@tonic-gate * callbacks don't get run, all will be OK as the disable just makes
18100Sstevel@tonic-gate * it more likely that the pages can be collected.
18110Sstevel@tonic-gate */
18120Sstevel@tonic-gate ASSERT(ret == 0);
18130Sstevel@tonic-gate }
18143247Sgjelinek
18153247Sgjelinek /*
18163247Sgjelinek * Verify that segment is not a shared anonymous segment which reserves
18173247Sgjelinek * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
18183247Sgjelinek * from one zone to another if any segments are shared. This is because the
18193247Sgjelinek * last process to exit will credit the swap reservation. This could lead
18203247Sgjelinek * to the swap being reserved by one zone, and credited to another.
18213247Sgjelinek */
18223247Sgjelinek boolean_t
18233247Sgjelinek seg_can_change_zones(struct seg *seg)
18243247Sgjelinek {
18253247Sgjelinek struct segvn_data *svd;
18263247Sgjelinek
18273247Sgjelinek if (seg->s_ops == &segspt_shmops)
18283247Sgjelinek return (B_FALSE);
18293247Sgjelinek
18303247Sgjelinek if (seg->s_ops == &segvn_ops) {
18313247Sgjelinek svd = (struct segvn_data *)seg->s_data;
18323247Sgjelinek if (svd->type == MAP_SHARED &&
18333247Sgjelinek svd->amp != NULL &&
18343247Sgjelinek svd->amp->swresv > 0)
18353247Sgjelinek return (B_FALSE);
18363247Sgjelinek }
18373247Sgjelinek return (B_TRUE);
18383247Sgjelinek }
18393247Sgjelinek
18403247Sgjelinek /*
18413247Sgjelinek * Return swap reserved by a segment backing a private mapping.
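 * (Shared reservations are deliberately not counted here; see
 * seg_can_change_zones() above for how shared segments are treated.)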
18423247Sgjelinek */ 18433247Sgjelinek size_t 18443247Sgjelinek seg_swresv(struct seg *seg) 18453247Sgjelinek { 18463247Sgjelinek struct segvn_data *svd; 18473247Sgjelinek size_t swap = 0; 18483247Sgjelinek 18493247Sgjelinek if (seg->s_ops == &segvn_ops) { 18503247Sgjelinek svd = (struct segvn_data *)seg->s_data; 18513247Sgjelinek if (svd->type == MAP_PRIVATE && svd->swresv > 0) 18523247Sgjelinek swap = svd->swresv; 18533247Sgjelinek } 18543247Sgjelinek return (swap); 18553247Sgjelinek } 1856