xref: /onnv-gate/usr/src/uts/i86pc/vm/hment.c (revision 3543:4a9ba10f6a83)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
53308Ssudheer  * Common Development and Distribution License (the "License").
63308Ssudheer  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*3543Sjosephb  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
270Sstevel@tonic-gate 
280Sstevel@tonic-gate #include <sys/types.h>
290Sstevel@tonic-gate #include <sys/sysmacros.h>
300Sstevel@tonic-gate #include <sys/kmem.h>
310Sstevel@tonic-gate #include <sys/atomic.h>
320Sstevel@tonic-gate #include <sys/bitmap.h>
330Sstevel@tonic-gate #include <sys/systm.h>
340Sstevel@tonic-gate #include <vm/seg_kmem.h>
350Sstevel@tonic-gate #include <vm/hat.h>
360Sstevel@tonic-gate #include <vm/vm_dep.h>
370Sstevel@tonic-gate #include <vm/hat_i86.h>
380Sstevel@tonic-gate #include <sys/cmn_err.h>
390Sstevel@tonic-gate 
400Sstevel@tonic-gate 
410Sstevel@tonic-gate /*
420Sstevel@tonic-gate  * When pages are shared by more than one mapping, a list of these
430Sstevel@tonic-gate  * structs hangs off of the page_t connected by the hm_next and hm_prev
440Sstevel@tonic-gate  * fields.  Every hment is also indexed by a system-wide hash table, using
450Sstevel@tonic-gate  * hm_hashnext to connect it to the chain of hments in a single hash
460Sstevel@tonic-gate  * bucket.
470Sstevel@tonic-gate  */
480Sstevel@tonic-gate struct hment {
490Sstevel@tonic-gate 	struct hment	*hm_hashnext;	/* next mapping on hash chain */
500Sstevel@tonic-gate 	struct hment	*hm_next;	/* next mapping of same page */
510Sstevel@tonic-gate 	struct hment	*hm_prev;	/* previous mapping of same page */
520Sstevel@tonic-gate 	htable_t	*hm_htable;	/* corresponding htable_t */
533308Ssudheer 	pfn_t		hm_pfn;		/* mapping page frame number */
540Sstevel@tonic-gate 	uint16_t	hm_entry;	/* index of pte in htable */
550Sstevel@tonic-gate 	uint16_t	hm_pad;		/* explicitly expose compiler padding */
560Sstevel@tonic-gate #ifdef __amd64
570Sstevel@tonic-gate 	uint32_t	hm_pad2;	/* explicitly expose compiler padding */
580Sstevel@tonic-gate #endif
590Sstevel@tonic-gate };
600Sstevel@tonic-gate 
610Sstevel@tonic-gate /*
620Sstevel@tonic-gate  * Value returned by hment_walk() when dealing with a single mapping
630Sstevel@tonic-gate  * embedded in the page_t.
640Sstevel@tonic-gate  */
650Sstevel@tonic-gate #define	HMENT_EMBEDDED ((hment_t *)(uintptr_t)1)
660Sstevel@tonic-gate 
670Sstevel@tonic-gate kmem_cache_t *hment_cache;
680Sstevel@tonic-gate 
690Sstevel@tonic-gate /*
700Sstevel@tonic-gate  * The hment reserve is similar to the htable reserve, with the following
710Sstevel@tonic-gate  * exception. Hment's are never needed for HAT kmem allocs.
720Sstevel@tonic-gate  *
730Sstevel@tonic-gate  * The hment_reserve_amount variable is used, so that you can change it's
740Sstevel@tonic-gate  * value to zero via a kernel debugger to force stealing to get tested.
750Sstevel@tonic-gate  */
760Sstevel@tonic-gate #define	HMENT_RESERVE_AMOUNT	(200)	/* currently a guess at right value. */
770Sstevel@tonic-gate uint_t hment_reserve_amount = HMENT_RESERVE_AMOUNT;
780Sstevel@tonic-gate kmutex_t hment_reserve_mutex;
790Sstevel@tonic-gate uint_t	hment_reserve_count;
800Sstevel@tonic-gate hment_t	*hment_reserve_pool;
810Sstevel@tonic-gate extern  kthread_t *hat_reserves_thread;
820Sstevel@tonic-gate 
830Sstevel@tonic-gate /*
840Sstevel@tonic-gate  * Possible performance RFE: we might need to make this dynamic, perhaps
850Sstevel@tonic-gate  * based on the number of pages in the system.
860Sstevel@tonic-gate  */
870Sstevel@tonic-gate #define	HMENT_HASH_SIZE (64 * 1024)
880Sstevel@tonic-gate static uint_t hment_hash_entries = HMENT_HASH_SIZE;
890Sstevel@tonic-gate static hment_t **hment_hash;
900Sstevel@tonic-gate 
/*
 * Compute the hash bucket index for a (page table pfn, entry) pair.
 *
 * Lots of highly shared pages will have the same value for "entry" (consider
 * the starting address of "xterm" or "sh"). So we distinguish them by
 * mixing in the pfn of the page table twice: once shifted above the entry
 * bits and once unshifted.
 * The shift by 9 corresponds to the range of values for entry (0..511).
 *
 * The result is masked with (hment_hash_entries - 1), so that value must be
 * a power of two.  Both arguments are evaluated more than once (pfn) or once
 * (entry); every use is parenthesized so that expression arguments group
 * correctly.
 */
#define	HMENT_HASH(pfn, entry) (uint32_t) 	\
	((((pfn) << 9) + (entry) + (pfn)) & (hment_hash_entries - 1))
990Sstevel@tonic-gate 
1000Sstevel@tonic-gate /*
1010Sstevel@tonic-gate  * "mlist_lock" is a hashed mutex lock for protecting per-page mapping
1020Sstevel@tonic-gate  * lists and "hash_lock" is a similar lock protecting the hment hash
1030Sstevel@tonic-gate  * table.  The hashed approach is taken to avoid the spatial overhead of
1040Sstevel@tonic-gate  * maintaining a separate lock for each page, while still achieving better
1050Sstevel@tonic-gate  * scalability than a single lock would allow.
1060Sstevel@tonic-gate  */
1070Sstevel@tonic-gate #define	MLIST_NUM_LOCK	256		/* must be power of two */
1080Sstevel@tonic-gate static kmutex_t mlist_lock[MLIST_NUM_LOCK];
1090Sstevel@tonic-gate 
1100Sstevel@tonic-gate /*
1110Sstevel@tonic-gate  * the shift by 9 is so that all large pages don't use the same hash bucket
1120Sstevel@tonic-gate  */
1130Sstevel@tonic-gate #define	MLIST_MUTEX(pp) \
1140Sstevel@tonic-gate 	&mlist_lock[((pp)->p_pagenum + ((pp)->p_pagenum >> 9)) & \
1150Sstevel@tonic-gate 	(MLIST_NUM_LOCK - 1)]
1160Sstevel@tonic-gate 
1170Sstevel@tonic-gate #define	HASH_NUM_LOCK	256		/* must be power of two */
1180Sstevel@tonic-gate static kmutex_t hash_lock[HASH_NUM_LOCK];
1190Sstevel@tonic-gate 
1200Sstevel@tonic-gate #define	HASH_MUTEX(idx) &hash_lock[(idx) & (HASH_NUM_LOCK-1)]
1210Sstevel@tonic-gate 
1220Sstevel@tonic-gate static hment_t *hment_steal(void);
1230Sstevel@tonic-gate 
1240Sstevel@tonic-gate /*
1250Sstevel@tonic-gate  * put one hment onto the reserves list
1260Sstevel@tonic-gate  */
1270Sstevel@tonic-gate static void
1280Sstevel@tonic-gate hment_put_reserve(hment_t *hm)
1290Sstevel@tonic-gate {
1300Sstevel@tonic-gate 	HATSTAT_INC(hs_hm_put_reserve);
1310Sstevel@tonic-gate 	mutex_enter(&hment_reserve_mutex);
1320Sstevel@tonic-gate 	hm->hm_next = hment_reserve_pool;
1330Sstevel@tonic-gate 	hment_reserve_pool = hm;
1340Sstevel@tonic-gate 	++hment_reserve_count;
1350Sstevel@tonic-gate 	mutex_exit(&hment_reserve_mutex);
1360Sstevel@tonic-gate }
1370Sstevel@tonic-gate 
1380Sstevel@tonic-gate /*
1390Sstevel@tonic-gate  * Take one hment from the reserve.
1400Sstevel@tonic-gate  */
1410Sstevel@tonic-gate static hment_t *
1420Sstevel@tonic-gate hment_get_reserve(void)
1430Sstevel@tonic-gate {
1440Sstevel@tonic-gate 	hment_t *hm = NULL;
1450Sstevel@tonic-gate 
1460Sstevel@tonic-gate 	/*
1470Sstevel@tonic-gate 	 * We rely on a "donation system" to refill the hment reserve
1480Sstevel@tonic-gate 	 * list, which only takes place when we are allocating hments for
1490Sstevel@tonic-gate 	 * user mappings.  It is theoretically possible that an incredibly
1500Sstevel@tonic-gate 	 * long string of kernel hment_alloc()s with no intervening user
1510Sstevel@tonic-gate 	 * hment_alloc()s could exhaust that pool.
1520Sstevel@tonic-gate 	 */
1530Sstevel@tonic-gate 	HATSTAT_INC(hs_hm_get_reserve);
1540Sstevel@tonic-gate 	mutex_enter(&hment_reserve_mutex);
1550Sstevel@tonic-gate 	if (hment_reserve_count != 0) {
1560Sstevel@tonic-gate 		hm = hment_reserve_pool;
1570Sstevel@tonic-gate 		hment_reserve_pool = hm->hm_next;
1580Sstevel@tonic-gate 		--hment_reserve_count;
1590Sstevel@tonic-gate 	}
1600Sstevel@tonic-gate 	mutex_exit(&hment_reserve_mutex);
1610Sstevel@tonic-gate 	return (hm);
1620Sstevel@tonic-gate }
1630Sstevel@tonic-gate 
1640Sstevel@tonic-gate /*
1650Sstevel@tonic-gate  * Allocate an hment
1660Sstevel@tonic-gate  */
1670Sstevel@tonic-gate static hment_t *
1680Sstevel@tonic-gate hment_alloc()
1690Sstevel@tonic-gate {
1700Sstevel@tonic-gate 	int km_flag = can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP;
1710Sstevel@tonic-gate 	hment_t	*hm = NULL;
1720Sstevel@tonic-gate 
1730Sstevel@tonic-gate 	/*
1740Sstevel@tonic-gate 	 * If we aren't using the reserves, try using kmem to get an hment.
1750Sstevel@tonic-gate 	 * Donate any successful allocations to reserves if low.
1760Sstevel@tonic-gate 	 *
1770Sstevel@tonic-gate 	 * If we're in panic, resort to using the reserves.
1780Sstevel@tonic-gate 	 */
1790Sstevel@tonic-gate 	HATSTAT_INC(hs_hm_alloc);
180*3543Sjosephb 	if (!USE_HAT_RESERVES()) {
1810Sstevel@tonic-gate 		for (;;) {
1820Sstevel@tonic-gate 			hm = kmem_cache_alloc(hment_cache, km_flag);
183*3543Sjosephb 			if (USE_HAT_RESERVES() ||
184*3543Sjosephb 			    hment_reserve_count >= hment_reserve_amount)
1850Sstevel@tonic-gate 				break;
1860Sstevel@tonic-gate 			hment_put_reserve(hm);
1870Sstevel@tonic-gate 		}
1880Sstevel@tonic-gate 	}
1890Sstevel@tonic-gate 
1900Sstevel@tonic-gate 	/*
1910Sstevel@tonic-gate 	 * If allocation failed, we need to tap the reserves or steal
1920Sstevel@tonic-gate 	 */
1930Sstevel@tonic-gate 	if (hm == NULL) {
194*3543Sjosephb 		if (USE_HAT_RESERVES())
1950Sstevel@tonic-gate 			hm = hment_get_reserve();
1960Sstevel@tonic-gate 
1970Sstevel@tonic-gate 		/*
1980Sstevel@tonic-gate 		 * If we still haven't gotten an hment, attempt to steal one by
1990Sstevel@tonic-gate 		 * victimizing a mapping in a user htable.
2000Sstevel@tonic-gate 		 */
2010Sstevel@tonic-gate 		if (hm == NULL && can_steal_post_boot)
2020Sstevel@tonic-gate 			hm = hment_steal();
2030Sstevel@tonic-gate 
2040Sstevel@tonic-gate 		/*
2050Sstevel@tonic-gate 		 * we're in dire straights, try the reserve
2060Sstevel@tonic-gate 		 */
2070Sstevel@tonic-gate 		if (hm == NULL)
2080Sstevel@tonic-gate 			hm = hment_get_reserve();
2090Sstevel@tonic-gate 
2100Sstevel@tonic-gate 		/*
2110Sstevel@tonic-gate 		 * still no hment is a serious problem.
2120Sstevel@tonic-gate 		 */
2130Sstevel@tonic-gate 		if (hm == NULL)
2140Sstevel@tonic-gate 			panic("hment_alloc(): no reserve, couldn't steal");
2150Sstevel@tonic-gate 	}
2160Sstevel@tonic-gate 
2170Sstevel@tonic-gate 
2180Sstevel@tonic-gate 	hm->hm_entry = 0;
2190Sstevel@tonic-gate 	hm->hm_htable = NULL;
2200Sstevel@tonic-gate 	hm->hm_hashnext = NULL;
2210Sstevel@tonic-gate 	hm->hm_next = NULL;
2220Sstevel@tonic-gate 	hm->hm_prev = NULL;
2233308Ssudheer 	hm->hm_pfn = PFN_INVALID;
2240Sstevel@tonic-gate 	return (hm);
2250Sstevel@tonic-gate }
2260Sstevel@tonic-gate 
2270Sstevel@tonic-gate /*
2280Sstevel@tonic-gate  * Free an hment, possibly to the reserves list when called from the
2290Sstevel@tonic-gate  * thread using the reserves. For example, when freeing an hment during an
2300Sstevel@tonic-gate  * htable_steal(), we can't recurse into the kmem allocator, so we just
2310Sstevel@tonic-gate  * push the hment onto the reserve list.
2320Sstevel@tonic-gate  */
2330Sstevel@tonic-gate void
2340Sstevel@tonic-gate hment_free(hment_t *hm)
2350Sstevel@tonic-gate {
2360Sstevel@tonic-gate #ifdef DEBUG
2370Sstevel@tonic-gate 	/*
2380Sstevel@tonic-gate 	 * zero out all fields to try and force any race conditions to segfault
2390Sstevel@tonic-gate 	 */
2400Sstevel@tonic-gate 	bzero(hm, sizeof (*hm));
2410Sstevel@tonic-gate #endif
2420Sstevel@tonic-gate 	HATSTAT_INC(hs_hm_free);
243*3543Sjosephb 	if (USE_HAT_RESERVES() ||
2440Sstevel@tonic-gate 	    hment_reserve_count < hment_reserve_amount)
2450Sstevel@tonic-gate 		hment_put_reserve(hm);
2460Sstevel@tonic-gate 	else
2470Sstevel@tonic-gate 		kmem_cache_free(hment_cache, hm);
2480Sstevel@tonic-gate }
2490Sstevel@tonic-gate 
2500Sstevel@tonic-gate int
2510Sstevel@tonic-gate x86_hm_held(page_t *pp)
2520Sstevel@tonic-gate {
2530Sstevel@tonic-gate 	ASSERT(pp != NULL);
2540Sstevel@tonic-gate 	return (MUTEX_HELD(MLIST_MUTEX(pp)));
2550Sstevel@tonic-gate }
2560Sstevel@tonic-gate 
2570Sstevel@tonic-gate void
2580Sstevel@tonic-gate x86_hm_enter(page_t *pp)
2590Sstevel@tonic-gate {
2600Sstevel@tonic-gate 	ASSERT(pp != NULL);
2610Sstevel@tonic-gate 	mutex_enter(MLIST_MUTEX(pp));
2620Sstevel@tonic-gate }
2630Sstevel@tonic-gate 
2640Sstevel@tonic-gate void
2650Sstevel@tonic-gate x86_hm_exit(page_t *pp)
2660Sstevel@tonic-gate {
2670Sstevel@tonic-gate 	ASSERT(pp != NULL);
2680Sstevel@tonic-gate 	mutex_exit(MLIST_MUTEX(pp));
2690Sstevel@tonic-gate }
2700Sstevel@tonic-gate 
2710Sstevel@tonic-gate /*
2720Sstevel@tonic-gate  * Internal routine to add a full hment to a page_t mapping list
2730Sstevel@tonic-gate  */
2740Sstevel@tonic-gate static void
2750Sstevel@tonic-gate hment_insert(hment_t *hm, page_t *pp)
2760Sstevel@tonic-gate {
2770Sstevel@tonic-gate 	uint_t		idx;
2780Sstevel@tonic-gate 
2790Sstevel@tonic-gate 	ASSERT(x86_hm_held(pp));
2800Sstevel@tonic-gate 	ASSERT(!pp->p_embed);
2810Sstevel@tonic-gate 
2820Sstevel@tonic-gate 	/*
2830Sstevel@tonic-gate 	 * Add the hment to the page's mapping list.
2840Sstevel@tonic-gate 	 */
2850Sstevel@tonic-gate 	++pp->p_share;
2860Sstevel@tonic-gate 	hm->hm_next = pp->p_mapping;
2870Sstevel@tonic-gate 	if (pp->p_mapping != NULL)
2880Sstevel@tonic-gate 		((hment_t *)pp->p_mapping)->hm_prev = hm;
2890Sstevel@tonic-gate 	pp->p_mapping = hm;
2900Sstevel@tonic-gate 
2910Sstevel@tonic-gate 	/*
2920Sstevel@tonic-gate 	 * Add the hment to the system-wide hash table.
2930Sstevel@tonic-gate 	 */
2940Sstevel@tonic-gate 	idx = HMENT_HASH(hm->hm_htable->ht_pfn, hm->hm_entry);
2950Sstevel@tonic-gate 
2960Sstevel@tonic-gate 	mutex_enter(HASH_MUTEX(idx));
2970Sstevel@tonic-gate 	hm->hm_hashnext = hment_hash[idx];
2980Sstevel@tonic-gate 	hment_hash[idx] = hm;
2990Sstevel@tonic-gate 	mutex_exit(HASH_MUTEX(idx));
3000Sstevel@tonic-gate }
3010Sstevel@tonic-gate 
3020Sstevel@tonic-gate /*
3030Sstevel@tonic-gate  * Prepare a mapping list entry to the given page.
3040Sstevel@tonic-gate  *
3050Sstevel@tonic-gate  * There are 4 different situations to deal with:
3060Sstevel@tonic-gate  *
3070Sstevel@tonic-gate  * - Adding the first mapping to a page_t as an embedded hment
3080Sstevel@tonic-gate  * - Refaulting on an existing embedded mapping
3090Sstevel@tonic-gate  * - Upgrading an embedded mapping when adding a 2nd mapping
3100Sstevel@tonic-gate  * - Adding another mapping to a page_t that already has multiple mappings
3110Sstevel@tonic-gate  *	 note we don't optimized for the refaulting case here.
3120Sstevel@tonic-gate  *
3130Sstevel@tonic-gate  * Due to competition with other threads that may be mapping/unmapping the
3140Sstevel@tonic-gate  * same page and the need to drop all locks while allocating hments, any or
3150Sstevel@tonic-gate  * all of the 3 situations can occur (and in almost any order) in any given
3160Sstevel@tonic-gate  * call. Isn't this fun!
3170Sstevel@tonic-gate  */
3180Sstevel@tonic-gate hment_t *
3190Sstevel@tonic-gate hment_prepare(htable_t *htable, uint_t entry, page_t *pp)
3200Sstevel@tonic-gate {
3210Sstevel@tonic-gate 	hment_t		*hm = NULL;
3220Sstevel@tonic-gate 
3230Sstevel@tonic-gate 	ASSERT(x86_hm_held(pp));
3240Sstevel@tonic-gate 
3250Sstevel@tonic-gate 	for (;;) {
3260Sstevel@tonic-gate 
3270Sstevel@tonic-gate 		/*
3280Sstevel@tonic-gate 		 * The most common case is establishing the first mapping to a
3290Sstevel@tonic-gate 		 * page, so check that first. This doesn't need any allocated
3300Sstevel@tonic-gate 		 * hment.
3310Sstevel@tonic-gate 		 */
3320Sstevel@tonic-gate 		if (pp->p_mapping == NULL) {
3330Sstevel@tonic-gate 			ASSERT(!pp->p_embed);
3340Sstevel@tonic-gate 			ASSERT(pp->p_share == 0);
3350Sstevel@tonic-gate 			if (hm == NULL)
3360Sstevel@tonic-gate 				break;
3370Sstevel@tonic-gate 
3380Sstevel@tonic-gate 			/*
3390Sstevel@tonic-gate 			 * we had an hment already, so free it and retry
3400Sstevel@tonic-gate 			 */
3410Sstevel@tonic-gate 			goto free_and_continue;
3420Sstevel@tonic-gate 		}
3430Sstevel@tonic-gate 
3440Sstevel@tonic-gate 		/*
3450Sstevel@tonic-gate 		 * If there is an embedded mapping, we may need to
3460Sstevel@tonic-gate 		 * convert it to an hment.
3470Sstevel@tonic-gate 		 */
3480Sstevel@tonic-gate 		if (pp->p_embed) {
3490Sstevel@tonic-gate 
3500Sstevel@tonic-gate 			/* should point to htable */
3510Sstevel@tonic-gate 			ASSERT(pp->p_mapping != NULL);
3520Sstevel@tonic-gate 
3530Sstevel@tonic-gate 			/*
3540Sstevel@tonic-gate 			 * If we are faulting on a pre-existing mapping
3550Sstevel@tonic-gate 			 * there is no need to promote/allocate a new hment.
3560Sstevel@tonic-gate 			 * This happens a lot due to segmap.
3570Sstevel@tonic-gate 			 */
3580Sstevel@tonic-gate 			if (pp->p_mapping == htable && pp->p_mlentry == entry) {
3590Sstevel@tonic-gate 				if (hm == NULL)
3600Sstevel@tonic-gate 					break;
3610Sstevel@tonic-gate 				goto free_and_continue;
3620Sstevel@tonic-gate 			}
3630Sstevel@tonic-gate 
3640Sstevel@tonic-gate 			/*
3650Sstevel@tonic-gate 			 * If we have an hment allocated, use it to promote the
3660Sstevel@tonic-gate 			 * existing embedded mapping.
3670Sstevel@tonic-gate 			 */
3680Sstevel@tonic-gate 			if (hm != NULL) {
3690Sstevel@tonic-gate 				hm->hm_htable = pp->p_mapping;
3700Sstevel@tonic-gate 				hm->hm_entry = pp->p_mlentry;
3713308Ssudheer 				hm->hm_pfn = pp->p_pagenum;
3720Sstevel@tonic-gate 				pp->p_mapping = NULL;
3730Sstevel@tonic-gate 				pp->p_share = 0;
3740Sstevel@tonic-gate 				pp->p_embed = 0;
3750Sstevel@tonic-gate 				hment_insert(hm, pp);
3760Sstevel@tonic-gate 			}
3770Sstevel@tonic-gate 
3780Sstevel@tonic-gate 			/*
3790Sstevel@tonic-gate 			 * We either didn't have an hment allocated or we just
3800Sstevel@tonic-gate 			 * used it for the embedded mapping. In either case,
3810Sstevel@tonic-gate 			 * allocate another hment and restart.
3820Sstevel@tonic-gate 			 */
3830Sstevel@tonic-gate 			goto allocate_and_continue;
3840Sstevel@tonic-gate 		}
3850Sstevel@tonic-gate 
3860Sstevel@tonic-gate 		/*
3870Sstevel@tonic-gate 		 * Last possibility is that we're adding an hment to a list
3880Sstevel@tonic-gate 		 * of hments.
3890Sstevel@tonic-gate 		 */
3900Sstevel@tonic-gate 		if (hm != NULL)
3910Sstevel@tonic-gate 			break;
3920Sstevel@tonic-gate allocate_and_continue:
3930Sstevel@tonic-gate 		x86_hm_exit(pp);
3940Sstevel@tonic-gate 		hm = hment_alloc();
3950Sstevel@tonic-gate 		x86_hm_enter(pp);
3960Sstevel@tonic-gate 		continue;
3970Sstevel@tonic-gate 
3980Sstevel@tonic-gate free_and_continue:
3990Sstevel@tonic-gate 		/*
4000Sstevel@tonic-gate 		 * we allocated an hment already, free it and retry
4010Sstevel@tonic-gate 		 */
4020Sstevel@tonic-gate 		x86_hm_exit(pp);
4030Sstevel@tonic-gate 		hment_free(hm);
4040Sstevel@tonic-gate 		hm = NULL;
4050Sstevel@tonic-gate 		x86_hm_enter(pp);
4060Sstevel@tonic-gate 	}
4070Sstevel@tonic-gate 	ASSERT(x86_hm_held(pp));
4080Sstevel@tonic-gate 	return (hm);
4090Sstevel@tonic-gate }
4100Sstevel@tonic-gate 
4110Sstevel@tonic-gate /*
4120Sstevel@tonic-gate  * Record a mapping list entry for the htable/entry to the given page.
4130Sstevel@tonic-gate  *
4140Sstevel@tonic-gate  * hment_prepare() should have properly set up the situation.
4150Sstevel@tonic-gate  */
4160Sstevel@tonic-gate void
4170Sstevel@tonic-gate hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm)
4180Sstevel@tonic-gate {
4190Sstevel@tonic-gate 	ASSERT(x86_hm_held(pp));
4200Sstevel@tonic-gate 
4210Sstevel@tonic-gate 	/*
4220Sstevel@tonic-gate 	 * The most common case is establishing the first mapping to a
4230Sstevel@tonic-gate 	 * page, so check that first. This doesn't need any allocated
4240Sstevel@tonic-gate 	 * hment.
4250Sstevel@tonic-gate 	 */
4260Sstevel@tonic-gate 	if (pp->p_mapping == NULL) {
4270Sstevel@tonic-gate 		ASSERT(hm == NULL);
4280Sstevel@tonic-gate 		ASSERT(!pp->p_embed);
4290Sstevel@tonic-gate 		ASSERT(pp->p_share == 0);
4300Sstevel@tonic-gate 		pp->p_embed = 1;
4310Sstevel@tonic-gate 		pp->p_mapping = htable;
4320Sstevel@tonic-gate 		pp->p_mlentry = entry;
4330Sstevel@tonic-gate 		return;
4340Sstevel@tonic-gate 	}
4350Sstevel@tonic-gate 
4360Sstevel@tonic-gate 	/*
4370Sstevel@tonic-gate 	 * We should never get here with a pre-existing embedded maping
4380Sstevel@tonic-gate 	 */
4390Sstevel@tonic-gate 	ASSERT(!pp->p_embed);
4400Sstevel@tonic-gate 
4410Sstevel@tonic-gate 	/*
4420Sstevel@tonic-gate 	 * add the new hment to the mapping list
4430Sstevel@tonic-gate 	 */
4440Sstevel@tonic-gate 	ASSERT(hm != NULL);
4450Sstevel@tonic-gate 	hm->hm_htable = htable;
4460Sstevel@tonic-gate 	hm->hm_entry = entry;
4473308Ssudheer 	hm->hm_pfn = pp->p_pagenum;
4480Sstevel@tonic-gate 	hment_insert(hm, pp);
4490Sstevel@tonic-gate }
4500Sstevel@tonic-gate 
4510Sstevel@tonic-gate /*
4520Sstevel@tonic-gate  * Walk through the mappings for a page.
4530Sstevel@tonic-gate  *
4540Sstevel@tonic-gate  * must already have done an x86_hm_enter()
4550Sstevel@tonic-gate  */
4560Sstevel@tonic-gate hment_t *
4570Sstevel@tonic-gate hment_walk(page_t *pp, htable_t **ht, uint_t *entry, hment_t *prev)
4580Sstevel@tonic-gate {
4590Sstevel@tonic-gate 	hment_t		*hm;
4600Sstevel@tonic-gate 
4610Sstevel@tonic-gate 	ASSERT(x86_hm_held(pp));
4620Sstevel@tonic-gate 
4630Sstevel@tonic-gate 	if (pp->p_embed) {
4640Sstevel@tonic-gate 		if (prev == NULL) {
4650Sstevel@tonic-gate 			*ht = (htable_t *)pp->p_mapping;
4660Sstevel@tonic-gate 			*entry = pp->p_mlentry;
4670Sstevel@tonic-gate 			hm = HMENT_EMBEDDED;
4680Sstevel@tonic-gate 		} else {
4690Sstevel@tonic-gate 			ASSERT(prev == HMENT_EMBEDDED);
4700Sstevel@tonic-gate 			hm = NULL;
4710Sstevel@tonic-gate 		}
4720Sstevel@tonic-gate 	} else {
4730Sstevel@tonic-gate 		if (prev == NULL) {
4740Sstevel@tonic-gate 			ASSERT(prev != HMENT_EMBEDDED);
4750Sstevel@tonic-gate 			hm = (hment_t *)pp->p_mapping;
4760Sstevel@tonic-gate 		} else {
4770Sstevel@tonic-gate 			hm = prev->hm_next;
4780Sstevel@tonic-gate 		}
4790Sstevel@tonic-gate 
4800Sstevel@tonic-gate 		if (hm != NULL) {
4810Sstevel@tonic-gate 			*ht = hm->hm_htable;
4820Sstevel@tonic-gate 			*entry = hm->hm_entry;
4830Sstevel@tonic-gate 		}
4840Sstevel@tonic-gate 	}
4850Sstevel@tonic-gate 	return (hm);
4860Sstevel@tonic-gate }
4870Sstevel@tonic-gate 
4880Sstevel@tonic-gate /*
4890Sstevel@tonic-gate  * Remove a mapping to a page from its mapping list. Must have
4900Sstevel@tonic-gate  * the corresponding mapping list locked.
4910Sstevel@tonic-gate  * Finds the mapping list entry with the given pte_t and
4920Sstevel@tonic-gate  * unlinks it from the mapping list.
4930Sstevel@tonic-gate  */
4940Sstevel@tonic-gate hment_t *
4950Sstevel@tonic-gate hment_remove(page_t *pp, htable_t *ht, uint_t entry)
4960Sstevel@tonic-gate {
4970Sstevel@tonic-gate 	hment_t		*prev = NULL;
4980Sstevel@tonic-gate 	hment_t		*hm;
4990Sstevel@tonic-gate 	uint_t		idx;
5003308Ssudheer 	pfn_t		pfn;
5010Sstevel@tonic-gate 
5020Sstevel@tonic-gate 	ASSERT(x86_hm_held(pp));
5030Sstevel@tonic-gate 
5040Sstevel@tonic-gate 	/*
5050Sstevel@tonic-gate 	 * Check if we have only one mapping embedded in the page_t.
5060Sstevel@tonic-gate 	 */
5070Sstevel@tonic-gate 	if (pp->p_embed) {
5080Sstevel@tonic-gate 		ASSERT(ht == (htable_t *)pp->p_mapping);
5090Sstevel@tonic-gate 		ASSERT(entry == pp->p_mlentry);
5100Sstevel@tonic-gate 		ASSERT(pp->p_share == 0);
5110Sstevel@tonic-gate 		pp->p_mapping = NULL;
5120Sstevel@tonic-gate 		pp->p_mlentry = 0;
5130Sstevel@tonic-gate 		pp->p_embed = 0;
5140Sstevel@tonic-gate 		return (NULL);
5150Sstevel@tonic-gate 	}
5160Sstevel@tonic-gate 
5170Sstevel@tonic-gate 	/*
5180Sstevel@tonic-gate 	 * Otherwise it must be in the list of hments.
5190Sstevel@tonic-gate 	 * Find the hment in the system-wide hash table and remove it.
5200Sstevel@tonic-gate 	 */
5210Sstevel@tonic-gate 	ASSERT(pp->p_share != 0);
5223308Ssudheer 	pfn = pp->p_pagenum;
5230Sstevel@tonic-gate 	idx = HMENT_HASH(ht->ht_pfn, entry);
5240Sstevel@tonic-gate 	mutex_enter(HASH_MUTEX(idx));
5250Sstevel@tonic-gate 	hm = hment_hash[idx];
5263308Ssudheer 	while (hm && (hm->hm_htable != ht || hm->hm_entry != entry ||
5273308Ssudheer 	    hm->hm_pfn != pfn)) {
5280Sstevel@tonic-gate 		prev = hm;
5290Sstevel@tonic-gate 		hm = hm->hm_hashnext;
5300Sstevel@tonic-gate 	}
53147Sjosephb 	if (hm == NULL) {
53247Sjosephb 		panic("hment_remove() missing in hash table pp=%lx, ht=%lx,"
53347Sjosephb 		    "entry=0x%x hash index=0x%x", (uintptr_t)pp, (uintptr_t)ht,
53447Sjosephb 		    entry, idx);
53547Sjosephb 	}
5360Sstevel@tonic-gate 
5370Sstevel@tonic-gate 	if (prev)
5380Sstevel@tonic-gate 		prev->hm_hashnext = hm->hm_hashnext;
5390Sstevel@tonic-gate 	else
5400Sstevel@tonic-gate 		hment_hash[idx] = hm->hm_hashnext;
5410Sstevel@tonic-gate 	mutex_exit(HASH_MUTEX(idx));
5420Sstevel@tonic-gate 
5430Sstevel@tonic-gate 	/*
5440Sstevel@tonic-gate 	 * Remove the hment from the page's mapping list
5450Sstevel@tonic-gate 	 */
5460Sstevel@tonic-gate 	if (hm->hm_next)
5470Sstevel@tonic-gate 		hm->hm_next->hm_prev = hm->hm_prev;
5480Sstevel@tonic-gate 	if (hm->hm_prev)
5490Sstevel@tonic-gate 		hm->hm_prev->hm_next = hm->hm_next;
5500Sstevel@tonic-gate 	else
5510Sstevel@tonic-gate 		pp->p_mapping = hm->hm_next;
5520Sstevel@tonic-gate 
5530Sstevel@tonic-gate 	--pp->p_share;
5540Sstevel@tonic-gate 	hm->hm_hashnext = NULL;
5550Sstevel@tonic-gate 	hm->hm_next = NULL;
5560Sstevel@tonic-gate 	hm->hm_prev = NULL;
5570Sstevel@tonic-gate 
5580Sstevel@tonic-gate 	return (hm);
5590Sstevel@tonic-gate }
5600Sstevel@tonic-gate 
5610Sstevel@tonic-gate /*
5620Sstevel@tonic-gate  * Put initial hment's in the reserve pool.
5630Sstevel@tonic-gate  */
5640Sstevel@tonic-gate void
5650Sstevel@tonic-gate hment_reserve(uint_t count)
5660Sstevel@tonic-gate {
5670Sstevel@tonic-gate 	hment_t	*hm;
5680Sstevel@tonic-gate 
5690Sstevel@tonic-gate 	count += hment_reserve_amount;
5700Sstevel@tonic-gate 
5710Sstevel@tonic-gate 	while (hment_reserve_count < count) {
5720Sstevel@tonic-gate 		hm = kmem_cache_alloc(hment_cache, KM_NOSLEEP);
5730Sstevel@tonic-gate 		if (hm == NULL)
5740Sstevel@tonic-gate 			return;
5750Sstevel@tonic-gate 		hment_put_reserve(hm);
5760Sstevel@tonic-gate 	}
5770Sstevel@tonic-gate }
5780Sstevel@tonic-gate 
5790Sstevel@tonic-gate /*
5800Sstevel@tonic-gate  * Readjust the hment reserves after they may have been used.
5810Sstevel@tonic-gate  */
5820Sstevel@tonic-gate void
5830Sstevel@tonic-gate hment_adjust_reserve()
5840Sstevel@tonic-gate {
5850Sstevel@tonic-gate 	hment_t	*hm;
5860Sstevel@tonic-gate 
5870Sstevel@tonic-gate 	/*
5880Sstevel@tonic-gate 	 * Free up any excess reserves
5890Sstevel@tonic-gate 	 */
5900Sstevel@tonic-gate 	while (hment_reserve_count > hment_reserve_amount) {
5910Sstevel@tonic-gate 		ASSERT(curthread != hat_reserves_thread);
5920Sstevel@tonic-gate 		hm = hment_get_reserve();
5930Sstevel@tonic-gate 		if (hm == NULL)
5940Sstevel@tonic-gate 			return;
5950Sstevel@tonic-gate 		hment_free(hm);
5960Sstevel@tonic-gate 	}
5970Sstevel@tonic-gate }
5980Sstevel@tonic-gate 
5990Sstevel@tonic-gate /*
6000Sstevel@tonic-gate  * initialize hment data structures
6010Sstevel@tonic-gate  */
6020Sstevel@tonic-gate void
6030Sstevel@tonic-gate hment_init(void)
6040Sstevel@tonic-gate {
6050Sstevel@tonic-gate 	int i;
6060Sstevel@tonic-gate 	int flags = KMC_NOHASH | KMC_NODEBUG;
6070Sstevel@tonic-gate 
6080Sstevel@tonic-gate 	/*
6090Sstevel@tonic-gate 	 * Initialize kmem caches. On 32 bit kernel's we shut off
6100Sstevel@tonic-gate 	 * debug information to save on precious kernel VA usage.
6110Sstevel@tonic-gate 	 */
6120Sstevel@tonic-gate 	hment_cache = kmem_cache_create("hment_t",
6130Sstevel@tonic-gate 	    sizeof (hment_t), 0, NULL, NULL, NULL,
6140Sstevel@tonic-gate 	    NULL, hat_memload_arena, flags);
6150Sstevel@tonic-gate 
6160Sstevel@tonic-gate 	hment_hash = kmem_zalloc(hment_hash_entries * sizeof (hment_t *),
6170Sstevel@tonic-gate 	    KM_SLEEP);
6180Sstevel@tonic-gate 
6190Sstevel@tonic-gate 	for (i = 0; i < MLIST_NUM_LOCK; i++)
6200Sstevel@tonic-gate 		mutex_init(&mlist_lock[i], NULL, MUTEX_DEFAULT, NULL);
6210Sstevel@tonic-gate 
6220Sstevel@tonic-gate 	for (i = 0; i < HASH_NUM_LOCK; i++)
6230Sstevel@tonic-gate 		mutex_init(&hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
6240Sstevel@tonic-gate 
6250Sstevel@tonic-gate 
6260Sstevel@tonic-gate }
6270Sstevel@tonic-gate 
6280Sstevel@tonic-gate /*
6290Sstevel@tonic-gate  * return the number of mappings to a page
6300Sstevel@tonic-gate  *
6310Sstevel@tonic-gate  * Note there is no ASSERT() that the MUTEX is held for this.
6320Sstevel@tonic-gate  * Hence the return value might be inaccurate if this is called without
6330Sstevel@tonic-gate  * doing an x86_hm_enter().
6340Sstevel@tonic-gate  */
6350Sstevel@tonic-gate uint_t
6360Sstevel@tonic-gate hment_mapcnt(page_t *pp)
6370Sstevel@tonic-gate {
6380Sstevel@tonic-gate 	uint_t cnt;
6390Sstevel@tonic-gate 	uint_t szc;
6400Sstevel@tonic-gate 	page_t *larger;
6410Sstevel@tonic-gate 	hment_t	*hm;
6420Sstevel@tonic-gate 
6430Sstevel@tonic-gate 	x86_hm_enter(pp);
6440Sstevel@tonic-gate 	if (pp->p_mapping == NULL)
6450Sstevel@tonic-gate 		cnt = 0;
6460Sstevel@tonic-gate 	else if (pp->p_embed)
6470Sstevel@tonic-gate 		cnt = 1;
6480Sstevel@tonic-gate 	else
6490Sstevel@tonic-gate 		cnt = pp->p_share;
6500Sstevel@tonic-gate 	x86_hm_exit(pp);
6510Sstevel@tonic-gate 
6520Sstevel@tonic-gate 	/*
6530Sstevel@tonic-gate 	 * walk through all larger mapping sizes counting mappings
6540Sstevel@tonic-gate 	 */
6550Sstevel@tonic-gate 	for (szc = 1; szc <= pp->p_szc; ++szc) {
6560Sstevel@tonic-gate 		larger = PP_GROUPLEADER(pp, szc);
6570Sstevel@tonic-gate 		if (larger == pp)	/* don't double count large mappings */
6580Sstevel@tonic-gate 			continue;
6590Sstevel@tonic-gate 
6600Sstevel@tonic-gate 		x86_hm_enter(larger);
6610Sstevel@tonic-gate 		if (larger->p_mapping != NULL) {
6620Sstevel@tonic-gate 			if (larger->p_embed &&
6630Sstevel@tonic-gate 			    ((htable_t *)larger->p_mapping)->ht_level == szc) {
6640Sstevel@tonic-gate 				++cnt;
6650Sstevel@tonic-gate 			} else if (!larger->p_embed) {
6660Sstevel@tonic-gate 				for (hm = larger->p_mapping; hm;
6670Sstevel@tonic-gate 				    hm = hm->hm_next) {
6680Sstevel@tonic-gate 					if (hm->hm_htable->ht_level == szc)
6690Sstevel@tonic-gate 						++cnt;
6700Sstevel@tonic-gate 				}
6710Sstevel@tonic-gate 			}
6720Sstevel@tonic-gate 		}
6730Sstevel@tonic-gate 		x86_hm_exit(larger);
6740Sstevel@tonic-gate 	}
6750Sstevel@tonic-gate 	return (cnt);
6760Sstevel@tonic-gate }
6770Sstevel@tonic-gate 
6780Sstevel@tonic-gate /*
6790Sstevel@tonic-gate  * We need to steal an hment. Walk through all the page_t's until we
6800Sstevel@tonic-gate  * find one that has multiple mappings. Unload one of the mappings
6810Sstevel@tonic-gate  * and reclaim that hment. Note that we'll save/restart the starting
6820Sstevel@tonic-gate  * page to try and spread the pain.
6830Sstevel@tonic-gate  */
6840Sstevel@tonic-gate static page_t *last_page = NULL;
6850Sstevel@tonic-gate 
6860Sstevel@tonic-gate static hment_t *
6870Sstevel@tonic-gate hment_steal(void)
6880Sstevel@tonic-gate {
6890Sstevel@tonic-gate 	page_t *last = last_page;
6900Sstevel@tonic-gate 	page_t *pp = last;
6910Sstevel@tonic-gate 	hment_t *hm = NULL;
6920Sstevel@tonic-gate 	hment_t *hm2;
6930Sstevel@tonic-gate 	htable_t *ht;
6940Sstevel@tonic-gate 	uint_t found_one = 0;
6950Sstevel@tonic-gate 
6960Sstevel@tonic-gate 	HATSTAT_INC(hs_hm_steals);
6970Sstevel@tonic-gate 	if (pp == NULL)
6980Sstevel@tonic-gate 		last = pp = page_first();
6990Sstevel@tonic-gate 
7000Sstevel@tonic-gate 	while (!found_one) {
7010Sstevel@tonic-gate 		HATSTAT_INC(hs_hm_steal_exam);
7020Sstevel@tonic-gate 		pp = page_next(pp);
7030Sstevel@tonic-gate 		if (pp == NULL)
7040Sstevel@tonic-gate 			pp = page_first();
7050Sstevel@tonic-gate 
7060Sstevel@tonic-gate 		/*
7070Sstevel@tonic-gate 		 * The loop and function exit here if nothing found to steal.
7080Sstevel@tonic-gate 		 */
7090Sstevel@tonic-gate 		if (pp == last)
7100Sstevel@tonic-gate 			return (NULL);
7110Sstevel@tonic-gate 
7120Sstevel@tonic-gate 		/*
7130Sstevel@tonic-gate 		 * Only lock the page_t if it has hments.
7140Sstevel@tonic-gate 		 */
7150Sstevel@tonic-gate 		if (pp->p_mapping == NULL || pp->p_embed)
7160Sstevel@tonic-gate 			continue;
7170Sstevel@tonic-gate 
7180Sstevel@tonic-gate 		/*
7190Sstevel@tonic-gate 		 * Search the mapping list for a usable mapping.
7200Sstevel@tonic-gate 		 */
7210Sstevel@tonic-gate 		x86_hm_enter(pp);
7220Sstevel@tonic-gate 		if (!pp->p_embed) {
7230Sstevel@tonic-gate 			for (hm = pp->p_mapping; hm; hm = hm->hm_next) {
7240Sstevel@tonic-gate 				ht = hm->hm_htable;
7250Sstevel@tonic-gate 				if (ht->ht_hat != kas.a_hat &&
7260Sstevel@tonic-gate 				    ht->ht_busy == 0 &&
7270Sstevel@tonic-gate 				    ht->ht_lock_cnt == 0) {
7280Sstevel@tonic-gate 					found_one = 1;
7290Sstevel@tonic-gate 					break;
7300Sstevel@tonic-gate 				}
7310Sstevel@tonic-gate 			}
7320Sstevel@tonic-gate 		}
7330Sstevel@tonic-gate 		if (!found_one)
7340Sstevel@tonic-gate 			x86_hm_exit(pp);
7350Sstevel@tonic-gate 	}
7360Sstevel@tonic-gate 
7370Sstevel@tonic-gate 	/*
7380Sstevel@tonic-gate 	 * Steal the mapping we found.  Note that hati_page_unmap() will
7390Sstevel@tonic-gate 	 * do the x86_hm_exit().
7400Sstevel@tonic-gate 	 */
7410Sstevel@tonic-gate 	hm2 = hati_page_unmap(pp, ht, hm->hm_entry);
7420Sstevel@tonic-gate 	ASSERT(hm2 == hm);
7430Sstevel@tonic-gate 	last_page = pp;
7440Sstevel@tonic-gate 	return (hm);
7450Sstevel@tonic-gate }
746