/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/systm.h>
#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/vm_dep.h>
#include <vm/hat_i86.h>
#include <sys/cmn_err.h>


/*
 * When pages are shared by more than one mapping, a list of these
 * structs hangs off of the page_t connected by the hm_next and hm_prev
 * fields.  Every hment is also indexed by a system-wide hash table, using
 * hm_hashnext to connect it to the chain of hments in a single hash
 * bucket.
 */
struct hment {
        struct hment    *hm_hashnext;   /* next mapping on hash chain */
        struct hment    *hm_next;       /* next mapping of same page */
        struct hment    *hm_prev;       /* previous mapping of same page */
        htable_t        *hm_htable;     /* corresponding htable_t */
        uint16_t        hm_entry;       /* index of pte in htable */
        uint16_t        hm_pad;         /* explicitly expose compiler padding */
#ifdef __amd64
        uint32_t        hm_pad2;        /* explicitly expose compiler padding */
#endif
};

/*
 * Value returned by hment_walk() when dealing with a single mapping
 * embedded in the page_t.
 */
#define	HMENT_EMBEDDED ((hment_t *)(uintptr_t)1)

kmem_cache_t *hment_cache;

/*
 * The hment reserve is similar to the htable reserve, with the following
 * exception:  hments are never needed for HAT kmem allocs.
 *
 * The hment_reserve_amount variable is used, so that you can change its
 * value to zero via a kernel debugger to force stealing to get tested.
 */
#define	HMENT_RESERVE_AMOUNT	(200)	/* currently a guess at right value. */
uint_t hment_reserve_amount = HMENT_RESERVE_AMOUNT;
kmutex_t hment_reserve_mutex;
uint_t hment_reserve_count;
hment_t *hment_reserve_pool;
extern kthread_t *hat_reserves_thread;

/*
 * Possible performance RFE: we might need to make this dynamic, perhaps
 * based on the number of pages in the system.
 */
#define	HMENT_HASH_SIZE (64 * 1024)
static uint_t hment_hash_entries = HMENT_HASH_SIZE;
static hment_t **hment_hash;

/*
 * Lots of highly shared pages will have the same value for "entry" (consider
 * the starting address of "xterm" or "sh"). So we'll distinguish them by
 * adding the pfn of the page table into both the high and low bits.
 * The shift by 9 corresponds to the range of values for entry (0..511).
 */
#define	HMENT_HASH(pfn, entry)	(uint32_t)	\
	((((pfn) << 9) + entry + pfn) & (hment_hash_entries - 1))
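
/*
 * For illustration only (the values below are arbitrary and not from the
 * original code): with the default 64K-entry table, a page table at
 * pfn 0x1234 mapping entry 7 hashes to
 * (((0x1234 << 9) + 7 + 0x1234) & 0xffff) == 0x7a3b, so hments that share
 * the same "entry" but belong to different page tables land in different
 * buckets.
 */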

/*
 * "mlist_lock" is a hashed mutex lock for protecting per-page mapping
 * lists and "hash_lock" is a similar lock protecting the hment hash
 * table.  The hashed approach is taken to avoid the spatial overhead of
 * maintaining a separate lock for each page, while still achieving better
 * scalability than a single lock would allow.
 */
#define	MLIST_NUM_LOCK	256		/* must be power of two */
static kmutex_t mlist_lock[MLIST_NUM_LOCK];

/*
 * the shift by 9 is so that all large pages don't use the same hash bucket
 */
#define	MLIST_MUTEX(pp) \
	&mlist_lock[((pp)->p_pagenum + ((pp)->p_pagenum >> 9)) & \
	(MLIST_NUM_LOCK - 1)]

#define	HASH_NUM_LOCK	256		/* must be power of two */
static kmutex_t hash_lock[HASH_NUM_LOCK];

#define	HASH_MUTEX(idx)	&hash_lock[(idx) & (HASH_NUM_LOCK-1)]

static hment_t *hment_steal(void);

/*
 * put one hment onto the reserves list
 */
static void
hment_put_reserve(hment_t *hm)
{
        HATSTAT_INC(hs_hm_put_reserve);
        mutex_enter(&hment_reserve_mutex);
        hm->hm_next = hment_reserve_pool;
        hment_reserve_pool = hm;
        ++hment_reserve_count;
        mutex_exit(&hment_reserve_mutex);
}

/*
 * Take one hment from the reserve.
 */
static hment_t *
hment_get_reserve(void)
{
        hment_t *hm = NULL;

        /*
         * We rely on a "donation system" to refill the hment reserve
         * list, which only takes place when we are allocating hments for
         * user mappings.  It is theoretically possible that an incredibly
         * long string of kernel hment_alloc()s with no intervening user
         * hment_alloc()s could exhaust that pool.
         */
        HATSTAT_INC(hs_hm_get_reserve);
        mutex_enter(&hment_reserve_mutex);
        if (hment_reserve_count != 0) {
                hm = hment_reserve_pool;
                hment_reserve_pool = hm->hm_next;
                --hment_reserve_count;
        }
        mutex_exit(&hment_reserve_mutex);
        return (hm);
}

/*
 * Allocate an hment
 */
static hment_t *
hment_alloc()
{
        int km_flag = can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP;
        hment_t *hm = NULL;
        int use_reserves = (use_boot_reserve ||
            curthread == hat_reserves_thread || panicstr != NULL);

        /*
         * If we aren't using the reserves, try using kmem to get an hment.
         * Donate any successful allocations to reserves if low.
         *
         * If we're in panic, resort to using the reserves.
         */
        HATSTAT_INC(hs_hm_alloc);
        if (!use_reserves) {
                for (;;) {
                        hm = kmem_cache_alloc(hment_cache, km_flag);
                        if (hment_reserve_count >= hment_reserve_amount ||
                            hm == NULL || panicstr != NULL ||
                            curthread == hat_reserves_thread)
                                break;
                        hment_put_reserve(hm);
                }
        }

        /*
         * If allocation failed, we need to tap the reserves or steal
         */
        if (hm == NULL) {
                if (use_reserves)
                        hm = hment_get_reserve();

                /*
                 * If we still haven't gotten an hment, attempt to steal one by
                 * victimizing a mapping in a user htable.
                 */
                if (hm == NULL && can_steal_post_boot)
                        hm = hment_steal();

                /*
                 * we're in dire straits, try the reserve
                 */
                if (hm == NULL)
                        hm = hment_get_reserve();

                /*
                 * still no hment is a serious problem.
                 */
                if (hm == NULL)
                        panic("hment_alloc(): no reserve, couldn't steal");
        }

        hm->hm_entry = 0;
        hm->hm_htable = NULL;
        hm->hm_hashnext = NULL;
        hm->hm_next = NULL;
        hm->hm_prev = NULL;
        return (hm);
}

/*
 * Free an hment, possibly to the reserves list when called from the
 * thread using the reserves.  For example, when freeing an hment during an
 * htable_steal(), we can't recurse into the kmem allocator, so we just
 * push the hment onto the reserve list.
 */
void
hment_free(hment_t *hm)
{
#ifdef DEBUG
        /*
         * zero out all fields to try and force any race conditions to segfault
         */
        bzero(hm, sizeof (*hm));
#endif
        HATSTAT_INC(hs_hm_free);
        if (curthread == hat_reserves_thread ||
            hment_reserve_count < hment_reserve_amount)
                hment_put_reserve(hm);
        else
                kmem_cache_free(hment_cache, hm);
}

/*
 * Return non-zero if the mapping list lock for this page is held.
 */
int
x86_hm_held(page_t *pp)
{
        ASSERT(pp != NULL);
        return (MUTEX_HELD(MLIST_MUTEX(pp)));
}

/*
 * Lock the mapping list for this page.
 */
void
x86_hm_enter(page_t *pp)
{
        ASSERT(pp != NULL);
        mutex_enter(MLIST_MUTEX(pp));
}

/*
 * Unlock the mapping list for this page.
 */
void
x86_hm_exit(page_t *pp)
{
        ASSERT(pp != NULL);
        mutex_exit(MLIST_MUTEX(pp));
}

/*
 * Internal routine to add a full hment to a page_t mapping list
 */
static void
hment_insert(hment_t *hm, page_t *pp)
{
        uint_t idx;

        ASSERT(x86_hm_held(pp));
        ASSERT(!pp->p_embed);

        /*
         * Add the hment to the page's mapping list.
         */
        ++pp->p_share;
        hm->hm_next = pp->p_mapping;
        if (pp->p_mapping != NULL)
                ((hment_t *)pp->p_mapping)->hm_prev = hm;
        pp->p_mapping = hm;

        /*
         * Add the hment to the system-wide hash table.
         */
        idx = HMENT_HASH(hm->hm_htable->ht_pfn, hm->hm_entry);

        mutex_enter(HASH_MUTEX(idx));
        hm->hm_hashnext = hment_hash[idx];
        hment_hash[idx] = hm;
        mutex_exit(HASH_MUTEX(idx));
}

/*
 * Prepare a mapping list entry to the given page.
 *
 * There are 4 different situations to deal with:
 *
 * - Adding the first mapping to a page_t as an embedded hment
 * - Refaulting on an existing embedded mapping
 * - Upgrading an embedded mapping when adding a 2nd mapping
 * - Adding another mapping to a page_t that already has multiple mappings
 *   note we don't optimize for the refaulting case here.
 *
 * Due to competition with other threads that may be mapping/unmapping the
 * same page and the need to drop all locks while allocating hments, any or
 * all of these situations can occur (and in almost any order) in any given
 * call. Isn't this fun!
 */
hment_t *
hment_prepare(htable_t *htable, uint_t entry, page_t *pp)
{
        hment_t *hm = NULL;

        ASSERT(x86_hm_held(pp));

        for (;;) {

                /*
                 * The most common case is establishing the first mapping to a
                 * page, so check that first. This doesn't need any allocated
                 * hment.
                 */
                if (pp->p_mapping == NULL) {
                        ASSERT(!pp->p_embed);
                        ASSERT(pp->p_share == 0);
                        if (hm == NULL)
                                break;

                        /*
                         * we had an hment already, so free it and retry
                         */
                        goto free_and_continue;
                }

                /*
                 * If there is an embedded mapping, we may need to
                 * convert it to an hment.
                 */
                if (pp->p_embed) {

                        /* should point to htable */
                        ASSERT(pp->p_mapping != NULL);

                        /*
                         * If we are faulting on a pre-existing mapping
                         * there is no need to promote/allocate a new hment.
                         * This happens a lot due to segmap.
                         */
                        if (pp->p_mapping == htable && pp->p_mlentry == entry) {
                                if (hm == NULL)
                                        break;
                                goto free_and_continue;
                        }

                        /*
                         * If we have an hment allocated, use it to promote the
                         * existing embedded mapping.
                         */
                        if (hm != NULL) {
                                hm->hm_htable = pp->p_mapping;
                                hm->hm_entry = pp->p_mlentry;
                                pp->p_mapping = NULL;
                                pp->p_share = 0;
                                pp->p_embed = 0;
                                hment_insert(hm, pp);
                        }

                        /*
                         * We either didn't have an hment allocated or we just
                         * used it for the embedded mapping. In either case,
                         * allocate another hment and restart.
                         */
                        goto allocate_and_continue;
                }

                /*
                 * Last possibility is that we're adding an hment to a list
                 * of hments.
                 */
                if (hm != NULL)
                        break;
allocate_and_continue:
                x86_hm_exit(pp);
                hm = hment_alloc();
                x86_hm_enter(pp);
                continue;

free_and_continue:
                /*
                 * we allocated an hment already, free it and retry
                 */
                x86_hm_exit(pp);
                hment_free(hm);
                hm = NULL;
                x86_hm_enter(pp);
        }
        ASSERT(x86_hm_held(pp));
        return (hm);
}

/*
 * Record a mapping list entry for the htable/entry to the given page.
 *
 * hment_prepare() should have properly set up the situation.
 */
void
hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm)
{
        ASSERT(x86_hm_held(pp));

        /*
         * The most common case is establishing the first mapping to a
         * page, so check that first. This doesn't need any allocated
         * hment.
         */
        if (pp->p_mapping == NULL) {
                ASSERT(hm == NULL);
                ASSERT(!pp->p_embed);
                ASSERT(pp->p_share == 0);
                pp->p_embed = 1;
                pp->p_mapping = htable;
                pp->p_mlentry = entry;
                return;
        }

        /*
         * We should never get here with a pre-existing embedded mapping
         */
        ASSERT(!pp->p_embed);

        /*
         * add the new hment to the mapping list
         */
        ASSERT(hm != NULL);
        hm->hm_htable = htable;
        hm->hm_entry = entry;
        hment_insert(hm, pp);
}
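
/*
 * Illustrative sketch only (not part of the original source): the intended
 * calling protocol for hment_prepare()/hment_assign(), pieced together from
 * the comments above.  The function name is hypothetical and the PTE
 * installation step is a placeholder for whatever the caller actually does.
 */
#if 0	/* example only, not compiled */
static void
example_add_mapping(htable_t *ht, uint_t entry, page_t *pp)
{
        hment_t *hm;

        x86_hm_enter(pp);                       /* lock the mapping list */
        hm = hment_prepare(ht, entry, pp);      /* may drop/retake the lock */
        /* ... install the PTE for (ht, entry) pointing at pp here ... */
        hment_assign(ht, entry, pp, hm);        /* record the mapping */
        x86_hm_exit(pp);                        /* unlock the mapping list */
}
#endif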

/*
 * Walk through the mappings for a page.
 *
 * must already have done an x86_hm_enter()
 */
hment_t *
hment_walk(page_t *pp, htable_t **ht, uint_t *entry, hment_t *prev)
{
        hment_t *hm;

        ASSERT(x86_hm_held(pp));

        if (pp->p_embed) {
                if (prev == NULL) {
                        *ht = (htable_t *)pp->p_mapping;
                        *entry = pp->p_mlentry;
                        hm = HMENT_EMBEDDED;
                } else {
                        ASSERT(prev == HMENT_EMBEDDED);
                        hm = NULL;
                }
        } else {
                if (prev == NULL) {
                        ASSERT(prev != HMENT_EMBEDDED);
                        hm = (hment_t *)pp->p_mapping;
                } else {
                        hm = prev->hm_next;
                }

                if (hm != NULL) {
                        *ht = hm->hm_htable;
                        *entry = hm->hm_entry;
                }
        }
        return (hm);
}
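
/*
 * Illustrative sketch only (not part of the original source): visiting every
 * mapping of a page with hment_walk().  Per the comment above, the mapping
 * list lock must be held across the entire walk.  The function name is
 * hypothetical.
 */
#if 0	/* example only, not compiled */
static uint_t
example_count_mappings(page_t *pp)
{
        htable_t *ht;
        uint_t entry;
        uint_t cnt = 0;
        hment_t *hm = NULL;

        x86_hm_enter(pp);
        while ((hm = hment_walk(pp, &ht, &entry, hm)) != NULL)
                ++cnt;          /* (ht, entry) identifies one PTE mapping pp */
        x86_hm_exit(pp);
        return (cnt);
}
#endif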

/*
 * Remove a mapping to a page from its mapping list. Must have
 * the corresponding mapping list locked.
 * Finds the mapping list entry with the given pte_t and
 * unlinks it from the mapping list.
 */
hment_t *
hment_remove(page_t *pp, htable_t *ht, uint_t entry)
{
        hment_t *prev = NULL;
        hment_t *hm;
        uint_t idx;

        ASSERT(x86_hm_held(pp));

        /*
         * Check if we have only one mapping embedded in the page_t.
         */
        if (pp->p_embed) {
                ASSERT(ht == (htable_t *)pp->p_mapping);
                ASSERT(entry == pp->p_mlentry);
                ASSERT(pp->p_share == 0);
                pp->p_mapping = NULL;
                pp->p_mlentry = 0;
                pp->p_embed = 0;
                return (NULL);
        }

        /*
         * Otherwise it must be in the list of hments.
         * Find the hment in the system-wide hash table and remove it.
         */
        ASSERT(pp->p_share != 0);
        idx = HMENT_HASH(ht->ht_pfn, entry);
        mutex_enter(HASH_MUTEX(idx));
        hm = hment_hash[idx];
        while (hm && (hm->hm_htable != ht || hm->hm_entry != entry)) {
                prev = hm;
                hm = hm->hm_hashnext;
        }
        if (hm == NULL) {
                panic("hment_remove() missing in hash table pp=%lx, ht=%lx, "
                    "entry=0x%x hash index=0x%x", (uintptr_t)pp, (uintptr_t)ht,
                    entry, idx);
        }

        if (prev)
                prev->hm_hashnext = hm->hm_hashnext;
        else
                hment_hash[idx] = hm->hm_hashnext;
        mutex_exit(HASH_MUTEX(idx));

        /*
         * Remove the hment from the page's mapping list
         */
        if (hm->hm_next)
                hm->hm_next->hm_prev = hm->hm_prev;
        if (hm->hm_prev)
                hm->hm_prev->hm_next = hm->hm_next;
        else
                pp->p_mapping = hm->hm_next;

        --pp->p_share;
        hm->hm_hashnext = NULL;
        hm->hm_next = NULL;
        hm->hm_prev = NULL;

        return (hm);
}

/*
 * Put initial hments in the reserve pool.
 */
void
hment_reserve(uint_t count)
{
        hment_t *hm;

        count += hment_reserve_amount;

        while (hment_reserve_count < count) {
                hm = kmem_cache_alloc(hment_cache, KM_NOSLEEP);
                if (hm == NULL)
                        return;
                hment_put_reserve(hm);
        }
}

/*
 * Readjust the hment reserves after they may have been used.
 */
void
hment_adjust_reserve()
{
        hment_t *hm;

        /*
         * Free up any excess reserves
         */
        while (hment_reserve_count > hment_reserve_amount) {
                ASSERT(curthread != hat_reserves_thread);
                hm = hment_get_reserve();
                if (hm == NULL)
                        return;
                hment_free(hm);
        }
}

/*
 * initialize hment data structures
 */
void
hment_init(void)
{
        int i;
        int flags = KMC_NOHASH | KMC_NODEBUG;

        /*
         * Initialize kmem caches.  On 32 bit kernels we shut off
         * debug information to save on precious kernel VA usage.
         */
        hment_cache = kmem_cache_create("hment_t",
            sizeof (hment_t), 0, NULL, NULL, NULL,
            NULL, hat_memload_arena, flags);

        hment_hash = kmem_zalloc(hment_hash_entries * sizeof (hment_t *),
            KM_SLEEP);

        for (i = 0; i < MLIST_NUM_LOCK; i++)
                mutex_init(&mlist_lock[i], NULL, MUTEX_DEFAULT, NULL);

        for (i = 0; i < HASH_NUM_LOCK; i++)
                mutex_init(&hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
}

/*
 * return the number of mappings to a page
 *
 * Note there is no ASSERT() that the MUTEX is held for this.
 * Hence the return value might be inaccurate if this is called without
 * doing an x86_hm_enter().
 */
uint_t
hment_mapcnt(page_t *pp)
{
        uint_t cnt;
        uint_t szc;
        page_t *larger;
        hment_t *hm;

        x86_hm_enter(pp);
        if (pp->p_mapping == NULL)
                cnt = 0;
        else if (pp->p_embed)
                cnt = 1;
        else
                cnt = pp->p_share;
        x86_hm_exit(pp);

        /*
         * walk through all larger mapping sizes counting mappings
         */
        for (szc = 1; szc <= pp->p_szc; ++szc) {
                larger = PP_GROUPLEADER(pp, szc);
                if (larger == pp)       /* don't double count large mappings */
                        continue;

                x86_hm_enter(larger);
                if (larger->p_mapping != NULL) {
                        if (larger->p_embed &&
                            ((htable_t *)larger->p_mapping)->ht_level == szc) {
                                ++cnt;
                        } else if (!larger->p_embed) {
                                for (hm = larger->p_mapping; hm;
                                    hm = hm->hm_next) {
                                        if (hm->hm_htable->ht_level == szc)
                                                ++cnt;
                                }
                        }
                }
                x86_hm_exit(larger);
        }
        return (cnt);
}

/*
 * We need to steal an hment.  Walk through all the page_t's until we
 * find one that has multiple mappings.  Unload one of the mappings
 * and reclaim that hment.  Note that we'll save/restart the starting
 * page to try and spread the pain.
 */
static page_t *last_page = NULL;

static hment_t *
hment_steal(void)
{
        page_t *last = last_page;
        page_t *pp = last;
        hment_t *hm = NULL;
        hment_t *hm2;
        htable_t *ht;
        uint_t found_one = 0;

        HATSTAT_INC(hs_hm_steals);
        if (pp == NULL)
                last = pp = page_first();

        while (!found_one) {
                HATSTAT_INC(hs_hm_steal_exam);
                pp = page_next(pp);
                if (pp == NULL)
                        pp = page_first();

                /*
                 * The loop and function exit here if nothing found to steal.
                 */
                if (pp == last)
                        return (NULL);

                /*
                 * Only lock the page_t if it has hments.
                 */
                if (pp->p_mapping == NULL || pp->p_embed)
                        continue;

                /*
                 * Search the mapping list for a usable mapping.
                 */
                x86_hm_enter(pp);
                if (!pp->p_embed) {
                        for (hm = pp->p_mapping; hm; hm = hm->hm_next) {
                                ht = hm->hm_htable;
                                if (ht->ht_hat != kas.a_hat &&
                                    ht->ht_busy == 0 &&
                                    ht->ht_lock_cnt == 0) {
                                        found_one = 1;
                                        break;
                                }
                        }
                }
                if (!found_one)
                        x86_hm_exit(pp);
        }

        /*
         * Steal the mapping we found.  Note that hati_page_unmap() will
         * do the x86_hm_exit().
         */
        hm2 = hati_page_unmap(pp, ht, hm->hm_entry);
        ASSERT(hm2 == hm);
        last_page = pp;
        return (hm);
}