xref: /onnv-gate/usr/src/uts/i86pc/vm/vm_machdep.c (revision 3290:256464cbb73c)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
51443Skchow  * Common Development and Distribution License (the "License").
61443Skchow  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
221373Skchow  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
270Sstevel@tonic-gate /*	All Rights Reserved   */
280Sstevel@tonic-gate 
290Sstevel@tonic-gate /*
300Sstevel@tonic-gate  * Portions of this source code were derived from Berkeley 4.3 BSD
310Sstevel@tonic-gate  * under license from the Regents of the University of California.
320Sstevel@tonic-gate  */
330Sstevel@tonic-gate 
340Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
350Sstevel@tonic-gate 
360Sstevel@tonic-gate /*
370Sstevel@tonic-gate  * UNIX machine dependent virtual memory support.
380Sstevel@tonic-gate  */
390Sstevel@tonic-gate 
400Sstevel@tonic-gate #include <sys/types.h>
410Sstevel@tonic-gate #include <sys/param.h>
420Sstevel@tonic-gate #include <sys/systm.h>
430Sstevel@tonic-gate #include <sys/user.h>
440Sstevel@tonic-gate #include <sys/proc.h>
450Sstevel@tonic-gate #include <sys/kmem.h>
460Sstevel@tonic-gate #include <sys/vmem.h>
470Sstevel@tonic-gate #include <sys/buf.h>
480Sstevel@tonic-gate #include <sys/cpuvar.h>
490Sstevel@tonic-gate #include <sys/lgrp.h>
500Sstevel@tonic-gate #include <sys/disp.h>
510Sstevel@tonic-gate #include <sys/vm.h>
520Sstevel@tonic-gate #include <sys/mman.h>
530Sstevel@tonic-gate #include <sys/vnode.h>
540Sstevel@tonic-gate #include <sys/cred.h>
550Sstevel@tonic-gate #include <sys/exec.h>
560Sstevel@tonic-gate #include <sys/exechdr.h>
570Sstevel@tonic-gate #include <sys/debug.h>
582991Ssusans #include <sys/vmsystm.h>
590Sstevel@tonic-gate 
600Sstevel@tonic-gate #include <vm/hat.h>
610Sstevel@tonic-gate #include <vm/as.h>
620Sstevel@tonic-gate #include <vm/seg.h>
630Sstevel@tonic-gate #include <vm/seg_kp.h>
640Sstevel@tonic-gate #include <vm/seg_vn.h>
650Sstevel@tonic-gate #include <vm/page.h>
660Sstevel@tonic-gate #include <vm/seg_kmem.h>
670Sstevel@tonic-gate #include <vm/seg_kpm.h>
680Sstevel@tonic-gate #include <vm/vm_dep.h>
690Sstevel@tonic-gate 
700Sstevel@tonic-gate #include <sys/cpu.h>
710Sstevel@tonic-gate #include <sys/vm_machparam.h>
720Sstevel@tonic-gate #include <sys/memlist.h>
730Sstevel@tonic-gate #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
740Sstevel@tonic-gate #include <vm/hat_i86.h>
750Sstevel@tonic-gate #include <sys/x86_archext.h>
760Sstevel@tonic-gate #include <sys/elf_386.h>
770Sstevel@tonic-gate #include <sys/cmn_err.h>
780Sstevel@tonic-gate #include <sys/archsystm.h>
790Sstevel@tonic-gate #include <sys/machsystm.h>
800Sstevel@tonic-gate 
810Sstevel@tonic-gate #include <sys/vtrace.h>
820Sstevel@tonic-gate #include <sys/ddidmareq.h>
830Sstevel@tonic-gate #include <sys/promif.h>
840Sstevel@tonic-gate #include <sys/memnode.h>
850Sstevel@tonic-gate #include <sys/stack.h>
860Sstevel@tonic-gate 
872961Sdp78419 uint_t vac_colors = 1;
880Sstevel@tonic-gate 
890Sstevel@tonic-gate int largepagesupport = 0;
900Sstevel@tonic-gate extern uint_t page_create_new;
910Sstevel@tonic-gate extern uint_t page_create_exists;
920Sstevel@tonic-gate extern uint_t page_create_putbacks;
940Sstevel@tonic-gate extern uintptr_t eprom_kernelbase;
950Sstevel@tonic-gate extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */
960Sstevel@tonic-gate 
970Sstevel@tonic-gate /* 4g memory management */
980Sstevel@tonic-gate pgcnt_t		maxmem4g;
990Sstevel@tonic-gate pgcnt_t		freemem4g;
1000Sstevel@tonic-gate int		physmax4g;
1010Sstevel@tonic-gate int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
1020Sstevel@tonic-gate int		lotsfree4gshift = 3;
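/*
 * For example (illustrative numbers): with desfree4gshift of 4 and 4G worth
 * of 4K pages below the 4G line (maxmem4g == 0x100000), the derived
 * DESFREE4G threshold would be maxmem4g >> 4 == 0x10000 pages, i.e. 256M.
 * lotsfree4gshift is presumably the analogous shift for LOTSFREE4G.
 */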
1030Sstevel@tonic-gate 
1041385Skchow /* 16m memory management: desired number of free pages below 16m. */
1051385Skchow pgcnt_t		desfree16m = 0x380;
1061385Skchow 
1070Sstevel@tonic-gate #ifdef VM_STATS
1080Sstevel@tonic-gate struct {
1090Sstevel@tonic-gate 	ulong_t	pga_alloc;
1100Sstevel@tonic-gate 	ulong_t	pga_notfullrange;
1110Sstevel@tonic-gate 	ulong_t	pga_nulldmaattr;
1120Sstevel@tonic-gate 	ulong_t	pga_allocok;
1130Sstevel@tonic-gate 	ulong_t	pga_allocfailed;
1140Sstevel@tonic-gate 	ulong_t	pgma_alloc;
1150Sstevel@tonic-gate 	ulong_t	pgma_allocok;
1160Sstevel@tonic-gate 	ulong_t	pgma_allocfailed;
1170Sstevel@tonic-gate 	ulong_t	pgma_allocempty;
1180Sstevel@tonic-gate } pga_vmstats;
1190Sstevel@tonic-gate #endif
1200Sstevel@tonic-gate 
1210Sstevel@tonic-gate uint_t mmu_page_sizes;
1220Sstevel@tonic-gate 
1230Sstevel@tonic-gate /* How many page sizes the users can see */
1240Sstevel@tonic-gate uint_t mmu_exported_page_sizes;
1250Sstevel@tonic-gate 
126423Sdavemq /*
127423Sdavemq  * Number of pages in 1 GB.  Don't enable automatic large pages if we have
128423Sdavemq  * fewer than this many pages.
129423Sdavemq  */
1302991Ssusans pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
1312991Ssusans pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
1322991Ssusans 
1332991Ssusans /*
1342991Ssusans  * Maximum and default segment size tunables for user private
1352991Ssusans  * and shared anon memory, and user text and initialized data.
1362991Ssusans  * These can be patched via /etc/system to allow large pages
1372991Ssusans  * to be used for mapping application private and shared anon memory.
1382991Ssusans  */
1392991Ssusans size_t mcntl0_lpsize = MMU_PAGESIZE;
1402991Ssusans size_t max_uheap_lpsize = MMU_PAGESIZE;
1412991Ssusans size_t default_uheap_lpsize = MMU_PAGESIZE;
1422991Ssusans size_t max_ustack_lpsize = MMU_PAGESIZE;
1432991Ssusans size_t default_ustack_lpsize = MMU_PAGESIZE;
1442991Ssusans size_t max_privmap_lpsize = MMU_PAGESIZE;
1452991Ssusans size_t max_uidata_lpsize = MMU_PAGESIZE;
1462991Ssusans size_t max_utext_lpsize = MMU_PAGESIZE;
1472991Ssusans size_t max_shm_lpsize = MMU_PAGESIZE;
1480Sstevel@tonic-gate 
1490Sstevel@tonic-gate /*
1500Sstevel@tonic-gate  * Return the optimum page size for a given mapping
1510Sstevel@tonic-gate  */
1520Sstevel@tonic-gate /*ARGSUSED*/
1530Sstevel@tonic-gate size_t
1542991Ssusans map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
1550Sstevel@tonic-gate {
1562991Ssusans 	level_t l = 0;
1572991Ssusans 	size_t pgsz = MMU_PAGESIZE;
1582991Ssusans 	size_t max_lpsize;
1592991Ssusans 	uint_t mszc;
1600Sstevel@tonic-gate 
1612991Ssusans 	ASSERT(maptype != MAPPGSZ_VA);
1622991Ssusans 
1632991Ssusans 	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
1642991Ssusans 		return (MMU_PAGESIZE);
1652991Ssusans 	}
1660Sstevel@tonic-gate 
1670Sstevel@tonic-gate 	switch (maptype) {
1682991Ssusans 	case MAPPGSZ_HEAP:
1690Sstevel@tonic-gate 	case MAPPGSZ_STK:
1702991Ssusans 		max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
1712991Ssusans 		    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
1722991Ssusans 		if (max_lpsize == MMU_PAGESIZE) {
1732991Ssusans 			return (MMU_PAGESIZE);
1742991Ssusans 		}
1752991Ssusans 		if (len == 0) {
1762991Ssusans 			len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
1772991Ssusans 			    p->p_brksize - p->p_bssbase : p->p_stksize;
1782991Ssusans 		}
1792991Ssusans 		len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
1802991Ssusans 		    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
1812991Ssusans 
1820Sstevel@tonic-gate 		/*
1830Sstevel@tonic-gate 		 * use the page size that best fits len
1840Sstevel@tonic-gate 		 */
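		/*
		 * For example (illustrative, assuming 4K base pages and a 2M
		 * level-1 page size): a 3M heap with max_lpsize >= 2M selects
		 * pgsz == 2M below, while a 1M heap leaves pgsz at
		 * MMU_PAGESIZE because no large page level fits.
		 */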
1850Sstevel@tonic-gate 		for (l = mmu.max_page_level; l > 0; --l) {
1862991Ssusans 			if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
1870Sstevel@tonic-gate 				continue;
1882991Ssusans 			} else {
1892991Ssusans 				pgsz = LEVEL_SIZE(l);
1902991Ssusans 			}
1910Sstevel@tonic-gate 			break;
1920Sstevel@tonic-gate 		}
1932991Ssusans 
1942991Ssusans 		mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
1952991Ssusans 		    p->p_stkpageszc);
1962991Ssusans 		if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
1972991Ssusans 			pgsz = hw_page_array[mszc].hp_size;
1982991Ssusans 		}
1992991Ssusans 		return (pgsz);
2000Sstevel@tonic-gate 
2010Sstevel@tonic-gate 	/*
2020Sstevel@tonic-gate 	 * for ISM use the 1st large page size.
2030Sstevel@tonic-gate 	 */
2040Sstevel@tonic-gate 	case MAPPGSZ_ISM:
2050Sstevel@tonic-gate 		if (mmu.max_page_level == 0)
2060Sstevel@tonic-gate 			return (MMU_PAGESIZE);
2070Sstevel@tonic-gate 		return (LEVEL_SIZE(1));
2080Sstevel@tonic-gate 	}
2092991Ssusans 	return (pgsz);
2100Sstevel@tonic-gate }
2110Sstevel@tonic-gate 
2122991Ssusans static uint_t
2132991Ssusans map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
2142991Ssusans     size_t min_physmem)
2152991Ssusans {
2162991Ssusans 	caddr_t eaddr = addr + size;
2172991Ssusans 	uint_t szcvec = 0;
2182991Ssusans 	caddr_t raddr;
2192991Ssusans 	caddr_t readdr;
2202991Ssusans 	size_t	pgsz;
2212991Ssusans 	int i;
2222991Ssusans 
2232991Ssusans 	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
2242991Ssusans 		return (0);
2252991Ssusans 	}
2262991Ssusans 
2272991Ssusans 	for (i = mmu_page_sizes - 1; i > 0; i--) {
2282991Ssusans 		pgsz = page_get_pagesize(i);
2292991Ssusans 		if (pgsz > max_lpsize) {
2302991Ssusans 			continue;
2312991Ssusans 		}
2322991Ssusans 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
2332991Ssusans 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
2342991Ssusans 		if (raddr < addr || raddr >= readdr) {
2352991Ssusans 			continue;
2362991Ssusans 		}
2372991Ssusans 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
2382991Ssusans 			continue;
2392991Ssusans 		}
2402991Ssusans 		/*
2412991Ssusans 		 * Set szcvec to the remaining page sizes.
2422991Ssusans 		 */
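		/*
		 * For example, if the largest usable size code found here is
		 * i == 2, then szcvec == ((1 << 3) - 1) & ~1 == 0x6, i.e.
		 * size codes 1 and 2 (the base page size is excluded).
		 */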
2432991Ssusans 		szcvec = ((1 << (i + 1)) - 1) & ~1;
2442991Ssusans 		break;
2452991Ssusans 	}
2462991Ssusans 	return (szcvec);
2472991Ssusans }
2480Sstevel@tonic-gate 
2490Sstevel@tonic-gate /*
2500Sstevel@tonic-gate  * Return a bit vector of large page size codes that
2510Sstevel@tonic-gate  * can be used to map the [addr, addr + size) region.
2520Sstevel@tonic-gate  */
2530Sstevel@tonic-gate /*ARGSUSED*/
2540Sstevel@tonic-gate uint_t
2552991Ssusans map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
2562991Ssusans     int memcntl)
2570Sstevel@tonic-gate {
2582991Ssusans 	size_t max_lpsize = mcntl0_lpsize;
2590Sstevel@tonic-gate 
2602991Ssusans 	if (mmu.max_page_level == 0)
2610Sstevel@tonic-gate 		return (0);
2620Sstevel@tonic-gate 
2632991Ssusans 	if (flags & MAP_TEXT) {
2642991Ssusans 	    if (!memcntl)
2652991Ssusans 		max_lpsize = max_utext_lpsize;
2662991Ssusans 	    return (map_szcvec(addr, size, off, max_lpsize,
2672991Ssusans 		    shm_lpg_min_physmem));
2682991Ssusans 
2692991Ssusans 	} else if (flags & MAP_INITDATA) {
2702991Ssusans 	    if (!memcntl)
2712991Ssusans 		max_lpsize = max_uidata_lpsize;
2722991Ssusans 	    return (map_szcvec(addr, size, off, max_lpsize,
2732991Ssusans 		    privm_lpg_min_physmem));
2742991Ssusans 
2752991Ssusans 	} else if (type == MAPPGSZC_SHM) {
2762991Ssusans 	    if (!memcntl)
2772991Ssusans 		max_lpsize = max_shm_lpsize;
2782991Ssusans 	    return (map_szcvec(addr, size, off, max_lpsize,
2792991Ssusans 		    shm_lpg_min_physmem));
2800Sstevel@tonic-gate 
2812991Ssusans 	} else if (type == MAPPGSZC_HEAP) {
2822991Ssusans 	    if (!memcntl)
2832991Ssusans 		max_lpsize = max_uheap_lpsize;
2842991Ssusans 	    return (map_szcvec(addr, size, off, max_lpsize,
2852991Ssusans 		    privm_lpg_min_physmem));
2862414Saguzovsk 
2872991Ssusans 	} else if (type == MAPPGSZC_STACK) {
2882991Ssusans 	    if (!memcntl)
2892991Ssusans 		max_lpsize = max_ustack_lpsize;
2902991Ssusans 	    return (map_szcvec(addr, size, off, max_lpsize,
2912991Ssusans 		    privm_lpg_min_physmem));
2922991Ssusans 
2932991Ssusans 	} else {
2942991Ssusans 	    if (!memcntl)
2952991Ssusans 		max_lpsize = max_privmap_lpsize;
2962991Ssusans 	    return (map_szcvec(addr, size, off, max_lpsize,
2972991Ssusans 		    privm_lpg_min_physmem));
2982414Saguzovsk 	}
2992414Saguzovsk }
3002414Saguzovsk 
3010Sstevel@tonic-gate /*
3020Sstevel@tonic-gate  * Handle a pagefault.
3030Sstevel@tonic-gate  */
3040Sstevel@tonic-gate faultcode_t
3050Sstevel@tonic-gate pagefault(
3060Sstevel@tonic-gate 	caddr_t addr,
3070Sstevel@tonic-gate 	enum fault_type type,
3080Sstevel@tonic-gate 	enum seg_rw rw,
3090Sstevel@tonic-gate 	int iskernel)
3100Sstevel@tonic-gate {
3110Sstevel@tonic-gate 	struct as *as;
3120Sstevel@tonic-gate 	struct hat *hat;
3130Sstevel@tonic-gate 	struct proc *p;
3140Sstevel@tonic-gate 	kthread_t *t;
3150Sstevel@tonic-gate 	faultcode_t res;
3160Sstevel@tonic-gate 	caddr_t base;
3170Sstevel@tonic-gate 	size_t len;
3180Sstevel@tonic-gate 	int err;
3190Sstevel@tonic-gate 	int mapped_red;
3200Sstevel@tonic-gate 	uintptr_t ea;
3210Sstevel@tonic-gate 
3220Sstevel@tonic-gate 	ASSERT_STACK_ALIGNED();
3230Sstevel@tonic-gate 
3240Sstevel@tonic-gate 	if (INVALID_VADDR(addr))
3250Sstevel@tonic-gate 		return (FC_NOMAP);
3260Sstevel@tonic-gate 
3270Sstevel@tonic-gate 	mapped_red = segkp_map_red();
3280Sstevel@tonic-gate 
3290Sstevel@tonic-gate 	if (iskernel) {
3300Sstevel@tonic-gate 		as = &kas;
3310Sstevel@tonic-gate 		hat = as->a_hat;
3320Sstevel@tonic-gate 	} else {
3330Sstevel@tonic-gate 		t = curthread;
3340Sstevel@tonic-gate 		p = ttoproc(t);
3350Sstevel@tonic-gate 		as = p->p_as;
3360Sstevel@tonic-gate 		hat = as->a_hat;
3370Sstevel@tonic-gate 	}
3380Sstevel@tonic-gate 
3390Sstevel@tonic-gate 	/*
3400Sstevel@tonic-gate 	 * Dispatch pagefault.
3410Sstevel@tonic-gate 	 */
3420Sstevel@tonic-gate 	res = as_fault(hat, as, addr, 1, type, rw);
3430Sstevel@tonic-gate 
3440Sstevel@tonic-gate 	/*
3450Sstevel@tonic-gate 	 * If this isn't a potential unmapped hole in the user's
3460Sstevel@tonic-gate 	 * UNIX data or stack segments, just return status info.
3470Sstevel@tonic-gate 	 */
3480Sstevel@tonic-gate 	if (res != FC_NOMAP || iskernel)
3490Sstevel@tonic-gate 		goto out;
3500Sstevel@tonic-gate 
3510Sstevel@tonic-gate 	/*
3520Sstevel@tonic-gate 	 * Check to see if we happened to fault on a currently unmapped
3530Sstevel@tonic-gate 	 * part of the UNIX data or stack segments.  If so, create a zfod
3540Sstevel@tonic-gate 	 * mapping there and then try calling the fault routine again.
3550Sstevel@tonic-gate 	 */
3560Sstevel@tonic-gate 	base = p->p_brkbase;
3570Sstevel@tonic-gate 	len = p->p_brksize;
3580Sstevel@tonic-gate 
3590Sstevel@tonic-gate 	if (addr < base || addr >= base + len) {		/* data seg? */
3600Sstevel@tonic-gate 		base = (caddr_t)p->p_usrstack - p->p_stksize;
3610Sstevel@tonic-gate 		len = p->p_stksize;
3620Sstevel@tonic-gate 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
3630Sstevel@tonic-gate 			/* not in either UNIX data or stack segments */
3640Sstevel@tonic-gate 			res = FC_NOMAP;
3650Sstevel@tonic-gate 			goto out;
3660Sstevel@tonic-gate 		}
3670Sstevel@tonic-gate 	}
3680Sstevel@tonic-gate 
3690Sstevel@tonic-gate 	/*
3700Sstevel@tonic-gate 	 * the rest of this function implements 3.X 4.X 5.X compatibility.
3710Sstevel@tonic-gate 	 * This code is probably not needed anymore.
3720Sstevel@tonic-gate 	 */
3730Sstevel@tonic-gate 	if (p->p_model == DATAMODEL_ILP32) {
3740Sstevel@tonic-gate 
3750Sstevel@tonic-gate 		/* expand the gap to the page boundaries on each side */
3760Sstevel@tonic-gate 		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
3770Sstevel@tonic-gate 		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
3780Sstevel@tonic-gate 		len = ea - (uintptr_t)base;
3790Sstevel@tonic-gate 
3800Sstevel@tonic-gate 		as_rangelock(as);
3810Sstevel@tonic-gate 		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
3820Sstevel@tonic-gate 		    0) {
3830Sstevel@tonic-gate 			err = as_map(as, base, len, segvn_create, zfod_argsp);
3840Sstevel@tonic-gate 			as_rangeunlock(as);
3850Sstevel@tonic-gate 			if (err) {
3860Sstevel@tonic-gate 				res = FC_MAKE_ERR(err);
3870Sstevel@tonic-gate 				goto out;
3880Sstevel@tonic-gate 			}
3890Sstevel@tonic-gate 		} else {
3900Sstevel@tonic-gate 			/*
3910Sstevel@tonic-gate 			 * This page is already mapped by another thread after
3920Sstevel@tonic-gate 			 * we returned from as_fault() above.  We just fall
3930Sstevel@tonic-gate 			 * through to as_fault() below.
3940Sstevel@tonic-gate 			 */
3950Sstevel@tonic-gate 			as_rangeunlock(as);
3960Sstevel@tonic-gate 		}
3970Sstevel@tonic-gate 
3980Sstevel@tonic-gate 		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
3990Sstevel@tonic-gate 	}
4000Sstevel@tonic-gate 
4010Sstevel@tonic-gate out:
4020Sstevel@tonic-gate 	if (mapped_red)
4030Sstevel@tonic-gate 		segkp_unmap_red();
4040Sstevel@tonic-gate 
4050Sstevel@tonic-gate 	return (res);
4060Sstevel@tonic-gate }
4070Sstevel@tonic-gate 
4080Sstevel@tonic-gate void
4090Sstevel@tonic-gate map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
4100Sstevel@tonic-gate {
4110Sstevel@tonic-gate 	struct proc *p = curproc;
4120Sstevel@tonic-gate 	caddr_t userlimit = (flags & _MAP_LOW32) ?
4130Sstevel@tonic-gate 	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;
4140Sstevel@tonic-gate 
4150Sstevel@tonic-gate 	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
4160Sstevel@tonic-gate }
4170Sstevel@tonic-gate 
4180Sstevel@tonic-gate /*ARGSUSED*/
4190Sstevel@tonic-gate int
4200Sstevel@tonic-gate map_addr_vacalign_check(caddr_t addr, u_offset_t off)
4210Sstevel@tonic-gate {
4220Sstevel@tonic-gate 	return (0);
4230Sstevel@tonic-gate }
4240Sstevel@tonic-gate 
4250Sstevel@tonic-gate /*
4260Sstevel@tonic-gate  * map_addr_proc() is the routine called when the system is to
4270Sstevel@tonic-gate  * choose an address for the user.  We will pick an address
4280Sstevel@tonic-gate  * range which is the highest available below kernelbase.
4290Sstevel@tonic-gate  *
4300Sstevel@tonic-gate  * addrp is a value/result parameter.
4310Sstevel@tonic-gate  *	On input it is a hint from the user to be used in a completely
4320Sstevel@tonic-gate  *	machine dependent fashion.  We decide to completely ignore this hint.
4330Sstevel@tonic-gate  *
4340Sstevel@tonic-gate  *	On output it is NULL if no address can be found in the current
4350Sstevel@tonic-gate  *	process's address space or else an address that is currently
4360Sstevel@tonic-gate  *	not mapped for len bytes with a page of red zone on either side.
4370Sstevel@tonic-gate  *
4380Sstevel@tonic-gate  *	align is not needed on x86 (it's for viturally addressed caches)
4390Sstevel@tonic-gate  *	align is not needed on x86 (it's for virtually addressed caches)
4400Sstevel@tonic-gate /*ARGSUSED*/
4410Sstevel@tonic-gate void
4420Sstevel@tonic-gate map_addr_proc(
4430Sstevel@tonic-gate 	caddr_t *addrp,
4440Sstevel@tonic-gate 	size_t len,
4450Sstevel@tonic-gate 	offset_t off,
4460Sstevel@tonic-gate 	int vacalign,
4470Sstevel@tonic-gate 	caddr_t userlimit,
4480Sstevel@tonic-gate 	struct proc *p,
4490Sstevel@tonic-gate 	uint_t flags)
4500Sstevel@tonic-gate {
4510Sstevel@tonic-gate 	struct as *as = p->p_as;
4520Sstevel@tonic-gate 	caddr_t addr;
4530Sstevel@tonic-gate 	caddr_t base;
4540Sstevel@tonic-gate 	size_t slen;
4550Sstevel@tonic-gate 	size_t align_amount;
4560Sstevel@tonic-gate 
4570Sstevel@tonic-gate 	ASSERT32(userlimit == as->a_userlimit);
4580Sstevel@tonic-gate 
4590Sstevel@tonic-gate 	base = p->p_brkbase;
4600Sstevel@tonic-gate #if defined(__amd64)
4610Sstevel@tonic-gate 	/*
4620Sstevel@tonic-gate 	 * XX64 Yes, this needs more work.
4630Sstevel@tonic-gate 	 */
4640Sstevel@tonic-gate 	if (p->p_model == DATAMODEL_NATIVE) {
4650Sstevel@tonic-gate 		if (userlimit < as->a_userlimit) {
4660Sstevel@tonic-gate 			/*
4670Sstevel@tonic-gate 			 * This happens when a program wants to map
4680Sstevel@tonic-gate 			 * something in a range that's accessible to a
4690Sstevel@tonic-gate 			 * program in a smaller address space.  For example,
4700Sstevel@tonic-gate 			 * a 64-bit program calling mmap32(2) to guarantee
4710Sstevel@tonic-gate 			 * that the returned address is below 4Gbytes.
4720Sstevel@tonic-gate 			 */
4730Sstevel@tonic-gate 			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
4740Sstevel@tonic-gate 
4750Sstevel@tonic-gate 			if (userlimit > base)
4760Sstevel@tonic-gate 				slen = userlimit - base;
4770Sstevel@tonic-gate 			else {
4780Sstevel@tonic-gate 				*addrp = NULL;
4790Sstevel@tonic-gate 				return;
4800Sstevel@tonic-gate 			}
4810Sstevel@tonic-gate 		} else {
4820Sstevel@tonic-gate 			/*
4830Sstevel@tonic-gate 			 * XX64 This layout is probably wrong .. but in
4840Sstevel@tonic-gate 			 * the event we make the amd64 address space look
4850Sstevel@tonic-gate 			 * like sparcv9 i.e. with the stack -above- the
4860Sstevel@tonic-gate 			 * heap, this bit of code might even be correct.
4870Sstevel@tonic-gate 			 */
4880Sstevel@tonic-gate 			slen = p->p_usrstack - base -
4890Sstevel@tonic-gate 			    (((size_t)rctl_enforced_value(
4900Sstevel@tonic-gate 			    rctlproc_legacy[RLIMIT_STACK],
4910Sstevel@tonic-gate 			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
4920Sstevel@tonic-gate 		}
4930Sstevel@tonic-gate 	} else
4940Sstevel@tonic-gate #endif
4950Sstevel@tonic-gate 		slen = userlimit - base;
4960Sstevel@tonic-gate 
4970Sstevel@tonic-gate 	len = (len + PAGEOFFSET) & PAGEMASK;
4980Sstevel@tonic-gate 
4990Sstevel@tonic-gate 	/*
5000Sstevel@tonic-gate 	 * Redzone for each side of the request. This is done to leave
5010Sstevel@tonic-gate 	 * one page unmapped between segments. This is not required, but
5020Sstevel@tonic-gate 	 * it's useful for the user because if their program strays across
5030Sstevel@tonic-gate 	 * a segment boundary, it will catch a fault immediately, making
5040Sstevel@tonic-gate 	 * debugging a little easier.
5050Sstevel@tonic-gate 	 */
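	/*
	 * For example, with 4K pages an 8K request ends up reserving 16K of
	 * address space: the two requested pages plus one unmapped redzone
	 * page on each side.
	 */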
5060Sstevel@tonic-gate 	len += 2 * MMU_PAGESIZE;
5070Sstevel@tonic-gate 
5080Sstevel@tonic-gate 	/*
5090Sstevel@tonic-gate 	 * figure out what the alignment should be
5100Sstevel@tonic-gate 	 *
5110Sstevel@tonic-gate 	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
5120Sstevel@tonic-gate 	 */
5130Sstevel@tonic-gate 	if (len <= ELF_386_MAXPGSZ) {
5140Sstevel@tonic-gate 		/*
5150Sstevel@tonic-gate 		 * Align virtual addresses to ensure that ELF shared libraries
5160Sstevel@tonic-gate 		 * are mapped with the appropriate alignment constraints by
5170Sstevel@tonic-gate 		 * the run-time linker.
5180Sstevel@tonic-gate 		 */
5190Sstevel@tonic-gate 		align_amount = ELF_386_MAXPGSZ;
5200Sstevel@tonic-gate 	} else {
5210Sstevel@tonic-gate 		int l = mmu.max_page_level;
5220Sstevel@tonic-gate 
5230Sstevel@tonic-gate 		while (l && len < LEVEL_SIZE(l))
5240Sstevel@tonic-gate 			--l;
5250Sstevel@tonic-gate 
5260Sstevel@tonic-gate 		align_amount = LEVEL_SIZE(l);
5270Sstevel@tonic-gate 	}
5280Sstevel@tonic-gate 
5290Sstevel@tonic-gate 	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
5300Sstevel@tonic-gate 		align_amount = (uintptr_t)*addrp;
5310Sstevel@tonic-gate 
5320Sstevel@tonic-gate 	len += align_amount;
5330Sstevel@tonic-gate 
5340Sstevel@tonic-gate 	/*
5350Sstevel@tonic-gate 	 * Look for a large enough hole starting below userlimit.
5360Sstevel@tonic-gate 	 * After finding it, use the upper part.  Addition of PAGESIZE
5370Sstevel@tonic-gate 	 * is for the redzone as described above.
5380Sstevel@tonic-gate 	 */
5390Sstevel@tonic-gate 	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
5400Sstevel@tonic-gate 		caddr_t as_addr;
5410Sstevel@tonic-gate 
5420Sstevel@tonic-gate 		addr = base + slen - len + MMU_PAGESIZE;
5430Sstevel@tonic-gate 		as_addr = addr;
5440Sstevel@tonic-gate 		/*
5450Sstevel@tonic-gate 		 * Round address DOWN to the alignment amount,
5460Sstevel@tonic-gate 		 * add the offset, and if this address is less
5470Sstevel@tonic-gate 		 * than the original address, add alignment amount.
5480Sstevel@tonic-gate 		 */
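		/*
		 * For example (hypothetical values): as_addr == 0x12345,
		 * align_amount == 0x1000 and (off & 0xfff) == 0x234 give
		 * 0x12000 after rounding down, 0x12234 after adding the
		 * offset, and finally 0x13234 after adding align_amount
		 * because 0x12234 is below the original as_addr.
		 */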
5490Sstevel@tonic-gate 		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
5500Sstevel@tonic-gate 		addr += (uintptr_t)(off & (align_amount - 1));
5510Sstevel@tonic-gate 		if (addr < as_addr)
5520Sstevel@tonic-gate 			addr += align_amount;
5530Sstevel@tonic-gate 
5540Sstevel@tonic-gate 		ASSERT(addr <= (as_addr + align_amount));
5550Sstevel@tonic-gate 		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
5560Sstevel@tonic-gate 		    ((uintptr_t)(off & (align_amount - 1))));
5570Sstevel@tonic-gate 		*addrp = addr;
5580Sstevel@tonic-gate 	} else {
5590Sstevel@tonic-gate 		*addrp = NULL;	/* no more virtual space */
5600Sstevel@tonic-gate 	}
5610Sstevel@tonic-gate }
5620Sstevel@tonic-gate 
5630Sstevel@tonic-gate /*
5640Sstevel@tonic-gate  * Determine whether [base, base+len] contains a valid range of
5650Sstevel@tonic-gate  * addresses at least minlen long. base and len are adjusted if
5660Sstevel@tonic-gate  * required to provide a valid range.
5670Sstevel@tonic-gate  */
5680Sstevel@tonic-gate /*ARGSUSED3*/
5690Sstevel@tonic-gate int
5700Sstevel@tonic-gate valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
5710Sstevel@tonic-gate {
5720Sstevel@tonic-gate 	uintptr_t hi, lo;
5730Sstevel@tonic-gate 
5740Sstevel@tonic-gate 	lo = (uintptr_t)*basep;
5750Sstevel@tonic-gate 	hi = lo + *lenp;
5760Sstevel@tonic-gate 
5770Sstevel@tonic-gate 	/*
5780Sstevel@tonic-gate 	 * If hi rolled over the top, try cutting back.
5790Sstevel@tonic-gate 	 */
5800Sstevel@tonic-gate 	if (hi < lo) {
5810Sstevel@tonic-gate 		if (0 - lo + hi < minlen)
5820Sstevel@tonic-gate 			return (0);
5830Sstevel@tonic-gate 		if (0 - lo < minlen)
5840Sstevel@tonic-gate 			return (0);
5850Sstevel@tonic-gate 		*lenp = 0 - lo;
5860Sstevel@tonic-gate 	} else if (hi - lo < minlen) {
5870Sstevel@tonic-gate 		return (0);
5880Sstevel@tonic-gate 	}
5890Sstevel@tonic-gate #if defined(__amd64)
5900Sstevel@tonic-gate 	/*
5910Sstevel@tonic-gate 	 * Deal with a possible hole in the address range between
5920Sstevel@tonic-gate 	 * hole_start and hole_end that should never be mapped.
5930Sstevel@tonic-gate 	 */
5940Sstevel@tonic-gate 	if (lo < hole_start) {
5950Sstevel@tonic-gate 		if (hi > hole_start) {
5960Sstevel@tonic-gate 			if (hi < hole_end) {
5970Sstevel@tonic-gate 				hi = hole_start;
5980Sstevel@tonic-gate 			} else {
5990Sstevel@tonic-gate 				/* lo < hole_start && hi >= hole_end */
6000Sstevel@tonic-gate 				if (dir == AH_LO) {
6010Sstevel@tonic-gate 					/*
6020Sstevel@tonic-gate 					 * prefer lowest range
6030Sstevel@tonic-gate 					 */
6040Sstevel@tonic-gate 					if (hole_start - lo >= minlen)
6050Sstevel@tonic-gate 						hi = hole_start;
6060Sstevel@tonic-gate 					else if (hi - hole_end >= minlen)
6070Sstevel@tonic-gate 						lo = hole_end;
6080Sstevel@tonic-gate 					else
6090Sstevel@tonic-gate 						return (0);
6100Sstevel@tonic-gate 				} else {
6110Sstevel@tonic-gate 					/*
6120Sstevel@tonic-gate 					 * prefer highest range
6130Sstevel@tonic-gate 					 */
6140Sstevel@tonic-gate 					if (hi - hole_end >= minlen)
6150Sstevel@tonic-gate 						lo = hole_end;
6160Sstevel@tonic-gate 					else if (hole_start - lo >= minlen)
6170Sstevel@tonic-gate 						hi = hole_start;
6180Sstevel@tonic-gate 					else
6190Sstevel@tonic-gate 						return (0);
6200Sstevel@tonic-gate 				}
6210Sstevel@tonic-gate 			}
6220Sstevel@tonic-gate 		}
6230Sstevel@tonic-gate 	} else {
6240Sstevel@tonic-gate 		/* lo >= hole_start */
6250Sstevel@tonic-gate 		if (hi < hole_end)
6260Sstevel@tonic-gate 			return (0);
6270Sstevel@tonic-gate 		if (lo < hole_end)
6280Sstevel@tonic-gate 			lo = hole_end;
6290Sstevel@tonic-gate 	}
6300Sstevel@tonic-gate 
6310Sstevel@tonic-gate 	if (hi - lo < minlen)
6320Sstevel@tonic-gate 		return (0);
6330Sstevel@tonic-gate 
6340Sstevel@tonic-gate 	*basep = (caddr_t)lo;
6350Sstevel@tonic-gate 	*lenp = hi - lo;
6360Sstevel@tonic-gate #endif
6370Sstevel@tonic-gate 	return (1);
6380Sstevel@tonic-gate }
6390Sstevel@tonic-gate 
6400Sstevel@tonic-gate /*
6410Sstevel@tonic-gate  * Determine whether [addr, addr+len] are valid user addresses.
6420Sstevel@tonic-gate  */
6430Sstevel@tonic-gate /*ARGSUSED*/
6440Sstevel@tonic-gate int
6450Sstevel@tonic-gate valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
6460Sstevel@tonic-gate     caddr_t userlimit)
6470Sstevel@tonic-gate {
6480Sstevel@tonic-gate 	caddr_t eaddr = addr + len;
6490Sstevel@tonic-gate 
6500Sstevel@tonic-gate 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
6510Sstevel@tonic-gate 		return (RANGE_BADADDR);
6520Sstevel@tonic-gate 
6530Sstevel@tonic-gate #if defined(__amd64)
6540Sstevel@tonic-gate 	/*
6550Sstevel@tonic-gate 	 * Check for the VA hole
6560Sstevel@tonic-gate 	 */
6570Sstevel@tonic-gate 	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
6580Sstevel@tonic-gate 		return (RANGE_BADADDR);
6590Sstevel@tonic-gate #endif
6600Sstevel@tonic-gate 
6610Sstevel@tonic-gate 	return (RANGE_OKAY);
6620Sstevel@tonic-gate }
6630Sstevel@tonic-gate 
6640Sstevel@tonic-gate /*
6650Sstevel@tonic-gate  * Return 1 if the page frame is onboard memory, else 0.
6660Sstevel@tonic-gate  */
6670Sstevel@tonic-gate int
6680Sstevel@tonic-gate pf_is_memory(pfn_t pf)
6690Sstevel@tonic-gate {
6700Sstevel@tonic-gate 	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
6710Sstevel@tonic-gate }
6720Sstevel@tonic-gate 
6730Sstevel@tonic-gate 
6740Sstevel@tonic-gate /*
6750Sstevel@tonic-gate  * initialized by page_coloring_init().
6760Sstevel@tonic-gate  */
6770Sstevel@tonic-gate uint_t	page_colors;
6780Sstevel@tonic-gate uint_t	page_colors_mask;
6790Sstevel@tonic-gate uint_t	page_coloring_shift;
6800Sstevel@tonic-gate int	cpu_page_colors;
6810Sstevel@tonic-gate static uint_t	l2_colors;
6820Sstevel@tonic-gate 
6830Sstevel@tonic-gate /*
6840Sstevel@tonic-gate  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
6850Sstevel@tonic-gate  * and page_colors are calculated from the l2 cache n-way set size.  Within a
6860Sstevel@tonic-gate  * mnode range, the page freelist and cachelist are hashed into bins based on
6870Sstevel@tonic-gate  * color. This makes it easier to search for a page within a specific memory
6880Sstevel@tonic-gate  * range.
6890Sstevel@tonic-gate  */
6900Sstevel@tonic-gate #define	PAGE_COLORS_MIN	16
6910Sstevel@tonic-gate 
6920Sstevel@tonic-gate page_t ****page_freelists;
6930Sstevel@tonic-gate page_t ***page_cachelists;
6940Sstevel@tonic-gate 
6950Sstevel@tonic-gate /*
6960Sstevel@tonic-gate  * As the PC architecture evolved, memory was clumped into several
6970Sstevel@tonic-gate  * ranges for various historical I/O devices to do DMA.
6980Sstevel@tonic-gate  * < 16Meg - ISA bus
6990Sstevel@tonic-gate  * < 2Gig - ???
7000Sstevel@tonic-gate  * < 4Gig - PCI bus or drivers that don't understand PAE mode
7010Sstevel@tonic-gate  */
7020Sstevel@tonic-gate static pfn_t arch_memranges[NUM_MEM_RANGES] = {
7030Sstevel@tonic-gate     0x100000,	/* pfn range for 4G and above */
7040Sstevel@tonic-gate     0x80000,	/* pfn range for 2G-4G */
7050Sstevel@tonic-gate     0x01000,	/* pfn range for 16M-2G */
7060Sstevel@tonic-gate     0x00000,	/* pfn range for 0-16M */
7070Sstevel@tonic-gate };
7080Sstevel@tonic-gate 
7090Sstevel@tonic-gate /*
7100Sstevel@tonic-gate  * These are changed during startup if the machine has limited memory.
7110Sstevel@tonic-gate  */
7120Sstevel@tonic-gate pfn_t *memranges = &arch_memranges[0];
7130Sstevel@tonic-gate int nranges = NUM_MEM_RANGES;
7140Sstevel@tonic-gate 
7150Sstevel@tonic-gate /*
7160Sstevel@tonic-gate  * Used by page layer to know about page sizes
7170Sstevel@tonic-gate  */
7180Sstevel@tonic-gate hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
7190Sstevel@tonic-gate 
7200Sstevel@tonic-gate /*
7210Sstevel@tonic-gate  * This can be patched via /etc/system to allow old non-PAE aware device
7220Sstevel@tonic-gate  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
7230Sstevel@tonic-gate  */
7240Sstevel@tonic-gate #if defined(__i386)
7251443Skchow int restricted_kmemalloc = 0;
7260Sstevel@tonic-gate #elif defined(__amd64)
7270Sstevel@tonic-gate int restricted_kmemalloc = 0;
7280Sstevel@tonic-gate #endif
7290Sstevel@tonic-gate 
7300Sstevel@tonic-gate kmutex_t	*fpc_mutex[NPC_MUTEX];
7310Sstevel@tonic-gate kmutex_t	*cpc_mutex[NPC_MUTEX];
7320Sstevel@tonic-gate 
7330Sstevel@tonic-gate 
7340Sstevel@tonic-gate /*
7350Sstevel@tonic-gate  * return the memrange containing pfn
7360Sstevel@tonic-gate  */
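/*
 * For example, with the default arch_memranges and 4K pages, a pfn of
 * 0x200000 (8G) falls in range 0 (4G and above) and a pfn of 0x50000
 * (1.25G) falls in range 2 (16M-2G).
 */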
7370Sstevel@tonic-gate int
7380Sstevel@tonic-gate memrange_num(pfn_t pfn)
7390Sstevel@tonic-gate {
7400Sstevel@tonic-gate 	int n;
7410Sstevel@tonic-gate 
7420Sstevel@tonic-gate 	for (n = 0; n < nranges - 1; ++n) {
7430Sstevel@tonic-gate 		if (pfn >= memranges[n])
7440Sstevel@tonic-gate 			break;
7450Sstevel@tonic-gate 	}
7460Sstevel@tonic-gate 	return (n);
7470Sstevel@tonic-gate }
7480Sstevel@tonic-gate 
7490Sstevel@tonic-gate /*
7500Sstevel@tonic-gate  * return the mnoderange containing pfn
7510Sstevel@tonic-gate  */
7520Sstevel@tonic-gate int
7530Sstevel@tonic-gate pfn_2_mtype(pfn_t pfn)
7540Sstevel@tonic-gate {
7550Sstevel@tonic-gate 	int	n;
7560Sstevel@tonic-gate 
7570Sstevel@tonic-gate 	for (n = mnoderangecnt - 1; n >= 0; n--) {
7580Sstevel@tonic-gate 		if (pfn >= mnoderanges[n].mnr_pfnlo) {
7590Sstevel@tonic-gate 			break;
7600Sstevel@tonic-gate 		}
7610Sstevel@tonic-gate 	}
7620Sstevel@tonic-gate 	return (n);
7630Sstevel@tonic-gate }
7640Sstevel@tonic-gate 
7650Sstevel@tonic-gate /*
7660Sstevel@tonic-gate  * is_contigpage_free:
7670Sstevel@tonic-gate  *	returns a page list of contiguous pages. It must return at least
7680Sstevel@tonic-gate  *	minctg pages. Caller determines minctg based on the scatter-gather
7690Sstevel@tonic-gate  *	list length.
7700Sstevel@tonic-gate  *
7710Sstevel@tonic-gate  *	pfnp is set to the next page frame to search on return.
7720Sstevel@tonic-gate  */
7730Sstevel@tonic-gate static page_t *
7740Sstevel@tonic-gate is_contigpage_free(
7750Sstevel@tonic-gate 	pfn_t *pfnp,
7760Sstevel@tonic-gate 	pgcnt_t *pgcnt,
7770Sstevel@tonic-gate 	pgcnt_t minctg,
7780Sstevel@tonic-gate 	uint64_t pfnseg,
7790Sstevel@tonic-gate 	int iolock)
7800Sstevel@tonic-gate {
7810Sstevel@tonic-gate 	int	i = 0;
7820Sstevel@tonic-gate 	pfn_t	pfn = *pfnp;
7830Sstevel@tonic-gate 	page_t	*pp;
7840Sstevel@tonic-gate 	page_t	*plist = NULL;
7850Sstevel@tonic-gate 
7860Sstevel@tonic-gate 	/*
7870Sstevel@tonic-gate 	 * fail if pfn + minctg crosses a segment boundary.
7880Sstevel@tonic-gate 	 * Adjust the next starting pfn to begin at the segment boundary.
7890Sstevel@tonic-gate 	 */
7900Sstevel@tonic-gate 
7910Sstevel@tonic-gate 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
7920Sstevel@tonic-gate 		*pfnp = roundup(*pfnp, pfnseg + 1);
7930Sstevel@tonic-gate 		return (NULL);
7940Sstevel@tonic-gate 	}
7950Sstevel@tonic-gate 
7960Sstevel@tonic-gate 	do {
7970Sstevel@tonic-gate retry:
7980Sstevel@tonic-gate 		pp = page_numtopp_nolock(pfn + i);
7990Sstevel@tonic-gate 		if ((pp == NULL) ||
8000Sstevel@tonic-gate 		    (page_trylock(pp, SE_EXCL) == 0)) {
8010Sstevel@tonic-gate 			(*pfnp)++;
8020Sstevel@tonic-gate 			break;
8030Sstevel@tonic-gate 		}
8040Sstevel@tonic-gate 		if (page_pptonum(pp) != pfn + i) {
8050Sstevel@tonic-gate 			page_unlock(pp);
8060Sstevel@tonic-gate 			goto retry;
8070Sstevel@tonic-gate 		}
8080Sstevel@tonic-gate 
8090Sstevel@tonic-gate 		if (!(PP_ISFREE(pp))) {
8100Sstevel@tonic-gate 			page_unlock(pp);
8110Sstevel@tonic-gate 			(*pfnp)++;
8120Sstevel@tonic-gate 			break;
8130Sstevel@tonic-gate 		}
8140Sstevel@tonic-gate 
8150Sstevel@tonic-gate 		if (!PP_ISAGED(pp)) {
8160Sstevel@tonic-gate 			page_list_sub(pp, PG_CACHE_LIST);
8170Sstevel@tonic-gate 			page_hashout(pp, (kmutex_t *)NULL);
8180Sstevel@tonic-gate 		} else {
8190Sstevel@tonic-gate 			page_list_sub(pp, PG_FREE_LIST);
8200Sstevel@tonic-gate 		}
8210Sstevel@tonic-gate 
8220Sstevel@tonic-gate 		if (iolock)
8230Sstevel@tonic-gate 			page_io_lock(pp);
8240Sstevel@tonic-gate 		page_list_concat(&plist, &pp);
8250Sstevel@tonic-gate 
8260Sstevel@tonic-gate 		/*
8270Sstevel@tonic-gate 		 * exit loop when pgcnt satisfied or segment boundary reached.
8280Sstevel@tonic-gate 		 */
8290Sstevel@tonic-gate 
8300Sstevel@tonic-gate 	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
8310Sstevel@tonic-gate 
8320Sstevel@tonic-gate 	*pfnp += i;		/* set to next pfn to search */
8330Sstevel@tonic-gate 
8340Sstevel@tonic-gate 	if (i >= minctg) {
8350Sstevel@tonic-gate 		*pgcnt -= i;
8360Sstevel@tonic-gate 		return (plist);
8370Sstevel@tonic-gate 	}
8380Sstevel@tonic-gate 
8390Sstevel@tonic-gate 	/*
8400Sstevel@tonic-gate 	 * failure: minctg not satisfied.
8410Sstevel@tonic-gate 	 *
8420Sstevel@tonic-gate 	 * if next request crosses segment boundary, set next pfn
8430Sstevel@tonic-gate 	 * to search from the segment boundary.
8440Sstevel@tonic-gate 	 */
8450Sstevel@tonic-gate 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
8460Sstevel@tonic-gate 		*pfnp = roundup(*pfnp, pfnseg + 1);
8470Sstevel@tonic-gate 
8480Sstevel@tonic-gate 	/* clean up any pages already allocated */
8490Sstevel@tonic-gate 
8500Sstevel@tonic-gate 	while (plist) {
8510Sstevel@tonic-gate 		pp = plist;
8520Sstevel@tonic-gate 		page_sub(&plist, pp);
8530Sstevel@tonic-gate 		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
8540Sstevel@tonic-gate 		if (iolock)
8550Sstevel@tonic-gate 			page_io_unlock(pp);
8560Sstevel@tonic-gate 		page_unlock(pp);
8570Sstevel@tonic-gate 	}
8580Sstevel@tonic-gate 
8590Sstevel@tonic-gate 	return (NULL);
8600Sstevel@tonic-gate }
8610Sstevel@tonic-gate 
8620Sstevel@tonic-gate /*
8630Sstevel@tonic-gate  * verify that pages being returned from the allocator have correct DMA attributes
8640Sstevel@tonic-gate  */
8650Sstevel@tonic-gate #ifndef DEBUG
8660Sstevel@tonic-gate #define	check_dma(a, b, c) (0)
8670Sstevel@tonic-gate #else
8680Sstevel@tonic-gate static void
8690Sstevel@tonic-gate check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
8700Sstevel@tonic-gate {
8710Sstevel@tonic-gate 	if (dma_attr == NULL)
8720Sstevel@tonic-gate 		return;
8730Sstevel@tonic-gate 
8740Sstevel@tonic-gate 	while (cnt-- > 0) {
8750Sstevel@tonic-gate 		if (mmu_ptob((uint64_t)pp->p_pagenum) <
8760Sstevel@tonic-gate 		    dma_attr->dma_attr_addr_lo)
8770Sstevel@tonic-gate 			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
8780Sstevel@tonic-gate 		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
8790Sstevel@tonic-gate 		    dma_attr->dma_attr_addr_hi)
8800Sstevel@tonic-gate 			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
8810Sstevel@tonic-gate 		pp = pp->p_next;
8820Sstevel@tonic-gate 	}
8830Sstevel@tonic-gate }
8840Sstevel@tonic-gate #endif
8850Sstevel@tonic-gate 
8860Sstevel@tonic-gate static kmutex_t	contig_lock;
8870Sstevel@tonic-gate 
8880Sstevel@tonic-gate #define	CONTIG_LOCK()	mutex_enter(&contig_lock);
8890Sstevel@tonic-gate #define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);
8900Sstevel@tonic-gate 
8910Sstevel@tonic-gate #define	PFN_16M		(mmu_btop((uint64_t)0x1000000))
8920Sstevel@tonic-gate 
8930Sstevel@tonic-gate static page_t *
8940Sstevel@tonic-gate page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
8950Sstevel@tonic-gate {
8960Sstevel@tonic-gate 	pfn_t		pfn;
8970Sstevel@tonic-gate 	int		sgllen;
8980Sstevel@tonic-gate 	uint64_t	pfnseg;
8990Sstevel@tonic-gate 	pgcnt_t		minctg;
9000Sstevel@tonic-gate 	page_t		*pplist = NULL, *plist;
9010Sstevel@tonic-gate 	uint64_t	lo, hi;
9020Sstevel@tonic-gate 	pgcnt_t		pfnalign = 0;
9030Sstevel@tonic-gate 	static pfn_t	startpfn;
9040Sstevel@tonic-gate 	static pgcnt_t	lastctgcnt;
9050Sstevel@tonic-gate 	uintptr_t	align;
9060Sstevel@tonic-gate 
9070Sstevel@tonic-gate 	CONTIG_LOCK();
9080Sstevel@tonic-gate 
9090Sstevel@tonic-gate 	if (mattr) {
9100Sstevel@tonic-gate 		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
9110Sstevel@tonic-gate 		hi = mmu_btop(mattr->dma_attr_addr_hi);
9120Sstevel@tonic-gate 		if (hi >= physmax)
9130Sstevel@tonic-gate 			hi = physmax - 1;
9140Sstevel@tonic-gate 		sgllen = mattr->dma_attr_sgllen;
9150Sstevel@tonic-gate 		pfnseg = mmu_btop(mattr->dma_attr_seg);
9160Sstevel@tonic-gate 
9170Sstevel@tonic-gate 		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
9180Sstevel@tonic-gate 		if (align > MMU_PAGESIZE)
9190Sstevel@tonic-gate 			pfnalign = mmu_btop(align);
9200Sstevel@tonic-gate 
9210Sstevel@tonic-gate 		/*
9220Sstevel@tonic-gate 		 * in order to satisfy the request, we must acquire at
9230Sstevel@tonic-gate 		 * least minctg contiguous pages
9240Sstevel@tonic-gate 		 */
9250Sstevel@tonic-gate 		minctg = howmany(*pgcnt, sgllen);
9260Sstevel@tonic-gate 
9270Sstevel@tonic-gate 		ASSERT(hi >= lo);
9280Sstevel@tonic-gate 
9290Sstevel@tonic-gate 		/*
9300Sstevel@tonic-gate 		 * start from where we last searched if minctg >= lastctgcnt
9310Sstevel@tonic-gate 		 */
9320Sstevel@tonic-gate 		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
9330Sstevel@tonic-gate 			startpfn = lo;
9340Sstevel@tonic-gate 	} else {
9350Sstevel@tonic-gate 		hi = physmax - 1;
9360Sstevel@tonic-gate 		lo = 0;
9370Sstevel@tonic-gate 		sgllen = 1;
9380Sstevel@tonic-gate 		pfnseg = mmu.highest_pfn;
9390Sstevel@tonic-gate 		minctg = *pgcnt;
9400Sstevel@tonic-gate 
9410Sstevel@tonic-gate 		if (minctg < lastctgcnt)
9420Sstevel@tonic-gate 			startpfn = lo;
9430Sstevel@tonic-gate 	}
9440Sstevel@tonic-gate 	lastctgcnt = minctg;
9450Sstevel@tonic-gate 
9460Sstevel@tonic-gate 	ASSERT(pfnseg + 1 >= (uint64_t)minctg);
9470Sstevel@tonic-gate 
9480Sstevel@tonic-gate 	/* conserve 16m memory - start search above 16m when possible */
9490Sstevel@tonic-gate 	if (hi > PFN_16M && startpfn < PFN_16M)
9500Sstevel@tonic-gate 		startpfn = PFN_16M;
9510Sstevel@tonic-gate 
9520Sstevel@tonic-gate 	pfn = startpfn;
9530Sstevel@tonic-gate 	if (pfnalign)
9540Sstevel@tonic-gate 		pfn = P2ROUNDUP(pfn, pfnalign);
9550Sstevel@tonic-gate 
9560Sstevel@tonic-gate 	while (pfn + minctg - 1 <= hi) {
9570Sstevel@tonic-gate 
9580Sstevel@tonic-gate 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
9590Sstevel@tonic-gate 		if (plist) {
9600Sstevel@tonic-gate 			page_list_concat(&pplist, &plist);
9610Sstevel@tonic-gate 			sgllen--;
9620Sstevel@tonic-gate 			/*
9630Sstevel@tonic-gate 			 * return when contig pages no longer needed
9640Sstevel@tonic-gate 			 */
9650Sstevel@tonic-gate 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
9660Sstevel@tonic-gate 				startpfn = pfn;
9670Sstevel@tonic-gate 				CONTIG_UNLOCK();
9680Sstevel@tonic-gate 				check_dma(mattr, pplist, *pgcnt);
9690Sstevel@tonic-gate 				return (pplist);
9700Sstevel@tonic-gate 			}
9710Sstevel@tonic-gate 			minctg = howmany(*pgcnt, sgllen);
9720Sstevel@tonic-gate 		}
9730Sstevel@tonic-gate 		if (pfnalign)
9740Sstevel@tonic-gate 			pfn = P2ROUNDUP(pfn, pfnalign);
9750Sstevel@tonic-gate 	}
9760Sstevel@tonic-gate 
9770Sstevel@tonic-gate 	/* cannot find contig pages in specified range */
9780Sstevel@tonic-gate 	if (startpfn == lo) {
9790Sstevel@tonic-gate 		CONTIG_UNLOCK();
9800Sstevel@tonic-gate 		return (NULL);
9810Sstevel@tonic-gate 	}
9820Sstevel@tonic-gate 
9830Sstevel@tonic-gate 	/* did not start with lo previously */
9840Sstevel@tonic-gate 	pfn = lo;
9850Sstevel@tonic-gate 	if (pfnalign)
9860Sstevel@tonic-gate 		pfn = P2ROUNDUP(pfn, pfnalign);
9870Sstevel@tonic-gate 
9880Sstevel@tonic-gate 	/* allow search to go above startpfn */
9890Sstevel@tonic-gate 	while (pfn < startpfn) {
9900Sstevel@tonic-gate 
9910Sstevel@tonic-gate 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
9920Sstevel@tonic-gate 		if (plist != NULL) {
9930Sstevel@tonic-gate 
9940Sstevel@tonic-gate 			page_list_concat(&pplist, &plist);
9950Sstevel@tonic-gate 			sgllen--;
9960Sstevel@tonic-gate 
9970Sstevel@tonic-gate 			/*
9980Sstevel@tonic-gate 			 * return when contig pages no longer needed
9990Sstevel@tonic-gate 			 */
10000Sstevel@tonic-gate 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
10010Sstevel@tonic-gate 				startpfn = pfn;
10020Sstevel@tonic-gate 				CONTIG_UNLOCK();
10030Sstevel@tonic-gate 				check_dma(mattr, pplist, *pgcnt);
10040Sstevel@tonic-gate 				return (pplist);
10050Sstevel@tonic-gate 			}
10060Sstevel@tonic-gate 			minctg = howmany(*pgcnt, sgllen);
10070Sstevel@tonic-gate 		}
10080Sstevel@tonic-gate 		if (pfnalign)
10090Sstevel@tonic-gate 			pfn = P2ROUNDUP(pfn, pfnalign);
10100Sstevel@tonic-gate 	}
10110Sstevel@tonic-gate 	CONTIG_UNLOCK();
10120Sstevel@tonic-gate 	return (NULL);
10130Sstevel@tonic-gate }
10140Sstevel@tonic-gate 
10150Sstevel@tonic-gate /*
10160Sstevel@tonic-gate  * combine mem_node_config and memrange memory ranges into one data
10170Sstevel@tonic-gate  * structure to be used for page list management.
10180Sstevel@tonic-gate  *
10190Sstevel@tonic-gate  * mnode_range_cnt() calculates the number of memory ranges for mnode and
10200Sstevel@tonic-gate  * memranges[]. Used to determine the size of page lists and mnoderanges.
10210Sstevel@tonic-gate  *
10220Sstevel@tonic-gate  * mnode_range_setup() initializes mnoderanges.
10230Sstevel@tonic-gate  */
10240Sstevel@tonic-gate mnoderange_t	*mnoderanges;
10250Sstevel@tonic-gate int		mnoderangecnt;
10260Sstevel@tonic-gate int		mtype4g;
10270Sstevel@tonic-gate 
10280Sstevel@tonic-gate int
10292961Sdp78419 mnode_range_cnt(int mnode)
10300Sstevel@tonic-gate {
10310Sstevel@tonic-gate 	int	mri;
10320Sstevel@tonic-gate 	int	mnrcnt = 0;
10330Sstevel@tonic-gate 
10342961Sdp78419 	if (mem_node_config[mnode].exists != 0) {
10350Sstevel@tonic-gate 		mri = nranges - 1;
10360Sstevel@tonic-gate 
10370Sstevel@tonic-gate 		/* find the memranges index containing the mnode's base pfn */
10380Sstevel@tonic-gate 
10390Sstevel@tonic-gate 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
10400Sstevel@tonic-gate 			mri--;
10410Sstevel@tonic-gate 
10420Sstevel@tonic-gate 		/*
10430Sstevel@tonic-gate 		 * increment mnode range counter when memranges or mnode
10440Sstevel@tonic-gate 		 * boundary is reached.
10450Sstevel@tonic-gate 		 */
10460Sstevel@tonic-gate 		while (mri >= 0 &&
10470Sstevel@tonic-gate 		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
10480Sstevel@tonic-gate 			mnrcnt++;
10490Sstevel@tonic-gate 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
10500Sstevel@tonic-gate 				mri--;
10510Sstevel@tonic-gate 			else
10520Sstevel@tonic-gate 				break;
10530Sstevel@tonic-gate 		}
10540Sstevel@tonic-gate 	}
10552961Sdp78419 	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
10560Sstevel@tonic-gate 	return (mnrcnt);
10570Sstevel@tonic-gate }
10580Sstevel@tonic-gate 
10590Sstevel@tonic-gate void
10600Sstevel@tonic-gate mnode_range_setup(mnoderange_t *mnoderanges)
10610Sstevel@tonic-gate {
10620Sstevel@tonic-gate 	int	mnode, mri;
10630Sstevel@tonic-gate 
10640Sstevel@tonic-gate 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
10650Sstevel@tonic-gate 		if (mem_node_config[mnode].exists == 0)
10660Sstevel@tonic-gate 			continue;
10670Sstevel@tonic-gate 
10680Sstevel@tonic-gate 		mri = nranges - 1;
10690Sstevel@tonic-gate 
10700Sstevel@tonic-gate 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
10710Sstevel@tonic-gate 			mri--;
10720Sstevel@tonic-gate 
10730Sstevel@tonic-gate 		while (mri >= 0 && mem_node_config[mnode].physmax >=
10740Sstevel@tonic-gate 		    MEMRANGELO(mri)) {
10750Sstevel@tonic-gate 			mnoderanges->mnr_pfnlo =
10760Sstevel@tonic-gate 			    MAX(MEMRANGELO(mri),
10770Sstevel@tonic-gate 				mem_node_config[mnode].physbase);
10780Sstevel@tonic-gate 			mnoderanges->mnr_pfnhi =
10790Sstevel@tonic-gate 			    MIN(MEMRANGEHI(mri),
10800Sstevel@tonic-gate 				mem_node_config[mnode].physmax);
10810Sstevel@tonic-gate 			mnoderanges->mnr_mnode = mnode;
10820Sstevel@tonic-gate 			mnoderanges->mnr_memrange = mri;
10830Sstevel@tonic-gate 			mnoderanges++;
10840Sstevel@tonic-gate 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
10850Sstevel@tonic-gate 				mri--;
10860Sstevel@tonic-gate 			else
10870Sstevel@tonic-gate 				break;
10880Sstevel@tonic-gate 		}
10890Sstevel@tonic-gate 	}
10900Sstevel@tonic-gate }
10910Sstevel@tonic-gate 
10920Sstevel@tonic-gate /*
10930Sstevel@tonic-gate  * Determine if the mnode range specified in mtype contains memory belonging
10940Sstevel@tonic-gate  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
10951385Skchow  * the range of indices from high pfn to 0, 16m or 4g.
10960Sstevel@tonic-gate  *
10970Sstevel@tonic-gate  * Return first mnode range type index found otherwise return -1 if none found.
10980Sstevel@tonic-gate  */
10990Sstevel@tonic-gate int
11000Sstevel@tonic-gate mtype_func(int mnode, int mtype, uint_t flags)
11010Sstevel@tonic-gate {
11020Sstevel@tonic-gate 	if (flags & PGI_MT_RANGE) {
11031385Skchow 		int	mtlim;
11040Sstevel@tonic-gate 
11050Sstevel@tonic-gate 		if (flags & PGI_MT_NEXT)
11060Sstevel@tonic-gate 			mtype--;
11071385Skchow 		if (flags & PGI_MT_RANGE0)
11081385Skchow 			mtlim = 0;
11091385Skchow 		else if (flags & PGI_MT_RANGE4G)
11101385Skchow 			mtlim = mtype4g + 1;	/* exclude 0-4g range */
11111385Skchow 		else if (flags & PGI_MT_RANGE16M)
11121385Skchow 			mtlim = 1;		/* exclude 0-16m range */
11130Sstevel@tonic-gate 		while (mtype >= mtlim) {
11140Sstevel@tonic-gate 			if (mnoderanges[mtype].mnr_mnode == mnode)
11150Sstevel@tonic-gate 				return (mtype);
11160Sstevel@tonic-gate 			mtype--;
11170Sstevel@tonic-gate 		}
11180Sstevel@tonic-gate 	} else {
11190Sstevel@tonic-gate 		if (mnoderanges[mtype].mnr_mnode == mnode)
11200Sstevel@tonic-gate 			return (mtype);
11210Sstevel@tonic-gate 	}
11220Sstevel@tonic-gate 	return (-1);
11230Sstevel@tonic-gate }
11240Sstevel@tonic-gate 
11250Sstevel@tonic-gate /*
11261373Skchow  * Update the page list max counts with the pfn range specified by the
11271373Skchow  * input parameters.  Called from add_physmem() when physical memory with
11281373Skchow  * page_t's is initially added to the page lists.
11291373Skchow  */
11301373Skchow void
11311373Skchow mtype_modify_max(pfn_t startpfn, long cnt)
11321373Skchow {
11331373Skchow 	int	mtype = 0;
11341373Skchow 	pfn_t	endpfn = startpfn + cnt, pfn;
11351373Skchow 	pgcnt_t	inc;
11361373Skchow 
11371373Skchow 	ASSERT(cnt > 0);
11381373Skchow 
11391373Skchow 	for (pfn = startpfn; pfn < endpfn; ) {
11401373Skchow 		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
11411373Skchow 			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
11421373Skchow 				inc = endpfn - pfn;
11431373Skchow 			} else {
11441373Skchow 				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
11451373Skchow 			}
11461373Skchow 			mnoderanges[mtype].mnr_mt_pgmax += inc;
11471373Skchow 			if (physmax4g && mtype <= mtype4g)
11481373Skchow 				maxmem4g += inc;
11491373Skchow 			pfn += inc;
11501373Skchow 		}
11511373Skchow 		mtype++;
11521373Skchow 		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
11531373Skchow 	}
11541373Skchow }
11551373Skchow 
11561373Skchow /*
1157414Skchow  * Returns the free page count for mnode
1158414Skchow  */
1159414Skchow int
1160414Skchow mnode_pgcnt(int mnode)
1161414Skchow {
1162414Skchow 	int	mtype = mnoderangecnt - 1;
1163414Skchow 	int	flags = PGI_MT_RANGE0;
1164414Skchow 	pgcnt_t	pgcnt = 0;
1165414Skchow 
1166414Skchow 	mtype = mtype_func(mnode, mtype, flags);
1167414Skchow 
1168414Skchow 	while (mtype != -1) {
11691385Skchow 		pgcnt += MTYPE_FREEMEM(mtype);
1170414Skchow 		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1171414Skchow 	}
1172414Skchow 	return (pgcnt);
1173414Skchow }
1174414Skchow 
1175414Skchow /*
11760Sstevel@tonic-gate  * Initialize page coloring variables based on the l2 cache parameters.
11770Sstevel@tonic-gate  * Calculate and return memory needed for page coloring data structures.
11780Sstevel@tonic-gate  */
11790Sstevel@tonic-gate size_t
11800Sstevel@tonic-gate page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
11810Sstevel@tonic-gate {
11820Sstevel@tonic-gate 	size_t	colorsz = 0;
11830Sstevel@tonic-gate 	int	i;
11840Sstevel@tonic-gate 	int	colors;
11850Sstevel@tonic-gate 
11860Sstevel@tonic-gate 	/*
11870Sstevel@tonic-gate 	 * Reduce the memory range lists if we don't have large amounts
11880Sstevel@tonic-gate 	 * of memory. This avoids searching known empty free lists.
11890Sstevel@tonic-gate 	 */
11900Sstevel@tonic-gate 	i = memrange_num(physmax);
11910Sstevel@tonic-gate 	memranges += i;
11920Sstevel@tonic-gate 	nranges -= i;
11930Sstevel@tonic-gate #if defined(__i386)
11940Sstevel@tonic-gate 	if (i > 0)
11950Sstevel@tonic-gate 		restricted_kmemalloc = 0;
11960Sstevel@tonic-gate #endif
11970Sstevel@tonic-gate 	/* physmax greater than 4g */
11980Sstevel@tonic-gate 	if (i == 0)
11990Sstevel@tonic-gate 		physmax4g = 1;
12000Sstevel@tonic-gate 
12010Sstevel@tonic-gate 	ASSERT(ISP2(l2_sz));
12020Sstevel@tonic-gate 	ASSERT(ISP2(l2_linesz));
12030Sstevel@tonic-gate 	ASSERT(l2_sz > MMU_PAGESIZE);
12040Sstevel@tonic-gate 
12050Sstevel@tonic-gate 	/* l2_assoc is 0 for fully associative l2 cache */
12060Sstevel@tonic-gate 	if (l2_assoc)
12070Sstevel@tonic-gate 		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
12080Sstevel@tonic-gate 	else
12090Sstevel@tonic-gate 		l2_colors = 1;
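	/*
	 * For example, a 1M 16-way set-associative l2 with 4K pages gives
	 * l2_colors = 0x100000 / (16 * 0x1000) = 16; a fully associative
	 * cache (l2_assoc == 0) always yields a single color.
	 */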
12100Sstevel@tonic-gate 
12110Sstevel@tonic-gate 	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
12120Sstevel@tonic-gate 	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
12130Sstevel@tonic-gate 
12140Sstevel@tonic-gate 	/*
12150Sstevel@tonic-gate 	 * cpu_page_colors is non-zero when a page color may be spread across
12160Sstevel@tonic-gate 	 * multiple bins.
12170Sstevel@tonic-gate 	 */
12180Sstevel@tonic-gate 	if (l2_colors < page_colors)
12190Sstevel@tonic-gate 		cpu_page_colors = l2_colors;
12200Sstevel@tonic-gate 
12210Sstevel@tonic-gate 	ASSERT(ISP2(page_colors));
12220Sstevel@tonic-gate 
12230Sstevel@tonic-gate 	page_colors_mask = page_colors - 1;
12240Sstevel@tonic-gate 
12250Sstevel@tonic-gate 	ASSERT(ISP2(CPUSETSIZE()));
12260Sstevel@tonic-gate 	page_coloring_shift = lowbit(CPUSETSIZE());
12270Sstevel@tonic-gate 
12282961Sdp78419 	/* initialize number of colors per page size */
12292961Sdp78419 	for (i = 0; i <= mmu.max_page_level; i++) {
12302961Sdp78419 		hw_page_array[i].hp_size = LEVEL_SIZE(i);
12312961Sdp78419 		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
12322961Sdp78419 		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
12332961Sdp78419 		hw_page_array[i].hp_colors = (page_colors_mask >>
12342961Sdp78419 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
12352961Sdp78419 		    + 1;
12362961Sdp78419 	}
12372961Sdp78419 
12382961Sdp78419 	/*
12392961Sdp78419 	 * The value of cpu_page_colors determines if additional color bins
12402961Sdp78419 	 * need to be checked for a particular color in the page_get routines.
12412961Sdp78419 	 */
12422961Sdp78419 	if (cpu_page_colors != 0) {
12432961Sdp78419 
12442961Sdp78419 		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
12452961Sdp78419 		ASSERT(a > 0);
12462961Sdp78419 		ASSERT(a < 16);
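		/*
		 * For power-of-two values, a is the log2 ratio of page_colors
		 * to cpu_page_colors; e.g. page_colors == 32 and
		 * cpu_page_colors == 16 give a == 1, recorded below in the
		 * upper nibble of colorequivszc[] as 0x10.
		 */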
12472961Sdp78419 
12482961Sdp78419 		for (i = 0; i <= mmu.max_page_level; i++) {
12492961Sdp78419 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
12502961Sdp78419 				colorequivszc[i] = 0;
12512961Sdp78419 				continue;
12522961Sdp78419 			}
12532961Sdp78419 			while ((colors >> a) == 0)
12542961Sdp78419 				a--;
12552961Sdp78419 			ASSERT(a >= 0);
12562961Sdp78419 
12572961Sdp78419 			/* the upper 4 bits encode the color equiv mask */
12582961Sdp78419 			colorequivszc[i] = (a << 4);
12592961Sdp78419 		}
12602961Sdp78419 	}
12612961Sdp78419 
12622961Sdp78419 	/* factor in colorequiv to check additional 'equivalent' bins. */
12632961Sdp78419 	if (colorequiv > 1) {
12642961Sdp78419 
12652961Sdp78419 		int a = lowbit(colorequiv) - 1;
12662961Sdp78419 		if (a > 15)
12672961Sdp78419 			a = 15;
12682961Sdp78419 
12692961Sdp78419 		for (i = 0; i <= mmu.max_page_level; i++) {
12702961Sdp78419 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
12712961Sdp78419 				continue;
12722961Sdp78419 			}
12732961Sdp78419 			while ((colors >> a) == 0)
12742961Sdp78419 				a--;
12752961Sdp78419 			if ((a << 4) > colorequivszc[i]) {
12762961Sdp78419 				colorequivszc[i] = (a << 4);
12772961Sdp78419 			}
12782961Sdp78419 		}
12792961Sdp78419 	}
12802961Sdp78419 
12810Sstevel@tonic-gate 	/* size for mnoderanges */
12822961Sdp78419 	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
12832961Sdp78419 		mnoderangecnt += mnode_range_cnt(i);
12840Sstevel@tonic-gate 	colorsz = mnoderangecnt * sizeof (mnoderange_t);
12850Sstevel@tonic-gate 
12860Sstevel@tonic-gate 	/* size for fpc_mutex and cpc_mutex */
12870Sstevel@tonic-gate 	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
12880Sstevel@tonic-gate 
12890Sstevel@tonic-gate 	/* size of page_freelists */
12900Sstevel@tonic-gate 	colorsz += mnoderangecnt * sizeof (page_t ***);
12910Sstevel@tonic-gate 	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
12920Sstevel@tonic-gate 
12930Sstevel@tonic-gate 	for (i = 0; i < mmu_page_sizes; i++) {
12940Sstevel@tonic-gate 		colors = page_get_pagecolors(i);
12950Sstevel@tonic-gate 		colorsz += mnoderangecnt * colors * sizeof (page_t *);
12960Sstevel@tonic-gate 	}
12970Sstevel@tonic-gate 
12980Sstevel@tonic-gate 	/* size of page_cachelists */
12990Sstevel@tonic-gate 	colorsz += mnoderangecnt * sizeof (page_t **);
13000Sstevel@tonic-gate 	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
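	/*
	 * The total computed here must cover everything that
	 * page_coloring_setup() below carves out of the buffer it is
	 * handed: the mnoderange array, the fpc/cpc mutex arrays, and the
	 * page_freelists/page_cachelists pointer and per-color arrays.
	 */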
13010Sstevel@tonic-gate 
13020Sstevel@tonic-gate 	return (colorsz);
13030Sstevel@tonic-gate }
13040Sstevel@tonic-gate 
13050Sstevel@tonic-gate /*
13060Sstevel@tonic-gate  * Called once at startup to configure page_coloring data structures and
13070Sstevel@tonic-gate  * to do the first page_free()/page_freelist_add().
13080Sstevel@tonic-gate  */
13090Sstevel@tonic-gate void
13100Sstevel@tonic-gate page_coloring_setup(caddr_t pcmemaddr)
13110Sstevel@tonic-gate {
13120Sstevel@tonic-gate 	int	i;
13130Sstevel@tonic-gate 	int	j;
13140Sstevel@tonic-gate 	int	k;
13150Sstevel@tonic-gate 	caddr_t	addr;
13160Sstevel@tonic-gate 	int	colors;
13170Sstevel@tonic-gate 
13180Sstevel@tonic-gate 	/*
13190Sstevel@tonic-gate 	 * do page coloring setup
13200Sstevel@tonic-gate 	 */
13210Sstevel@tonic-gate 	addr = pcmemaddr;
13220Sstevel@tonic-gate 
13230Sstevel@tonic-gate 	mnoderanges = (mnoderange_t *)addr;
13240Sstevel@tonic-gate 	addr += (mnoderangecnt * sizeof (mnoderange_t));
13250Sstevel@tonic-gate 
13260Sstevel@tonic-gate 	mnode_range_setup(mnoderanges);
13270Sstevel@tonic-gate 
13280Sstevel@tonic-gate 	if (physmax4g)
13290Sstevel@tonic-gate 		mtype4g = pfn_2_mtype(0xfffff);
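	/*
	 * pfn 0xfffff above is the last 4KB page below 4GB
	 * (0xfffff << 12 == 0xfffff000), so mtype4g identifies the memory
	 * type range that ends just under the 4GB boundary.
	 */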
13300Sstevel@tonic-gate 
13310Sstevel@tonic-gate 	for (k = 0; k < NPC_MUTEX; k++) {
13320Sstevel@tonic-gate 		fpc_mutex[k] = (kmutex_t *)addr;
13330Sstevel@tonic-gate 		addr += (max_mem_nodes * sizeof (kmutex_t));
13340Sstevel@tonic-gate 	}
13350Sstevel@tonic-gate 	for (k = 0; k < NPC_MUTEX; k++) {
13360Sstevel@tonic-gate 		cpc_mutex[k] = (kmutex_t *)addr;
13370Sstevel@tonic-gate 		addr += (max_mem_nodes * sizeof (kmutex_t));
13380Sstevel@tonic-gate 	}
13390Sstevel@tonic-gate 	page_freelists = (page_t ****)addr;
13400Sstevel@tonic-gate 	addr += (mnoderangecnt * sizeof (page_t ***));
13410Sstevel@tonic-gate 
13420Sstevel@tonic-gate 	page_cachelists = (page_t ***)addr;
13430Sstevel@tonic-gate 	addr += (mnoderangecnt * sizeof (page_t **));
13440Sstevel@tonic-gate 
13450Sstevel@tonic-gate 	for (i = 0; i < mnoderangecnt; i++) {
13460Sstevel@tonic-gate 		page_freelists[i] = (page_t ***)addr;
13470Sstevel@tonic-gate 		addr += (mmu_page_sizes * sizeof (page_t **));
13480Sstevel@tonic-gate 
13490Sstevel@tonic-gate 		for (j = 0; j < mmu_page_sizes; j++) {
13500Sstevel@tonic-gate 			colors = page_get_pagecolors(j);
13510Sstevel@tonic-gate 			page_freelists[i][j] = (page_t **)addr;
13520Sstevel@tonic-gate 			addr += (colors * sizeof (page_t *));
13530Sstevel@tonic-gate 		}
13540Sstevel@tonic-gate 		page_cachelists[i] = (page_t **)addr;
13550Sstevel@tonic-gate 		addr += (page_colors * sizeof (page_t *));
13560Sstevel@tonic-gate 	}
13570Sstevel@tonic-gate }
13580Sstevel@tonic-gate 
13590Sstevel@tonic-gate /*ARGSUSED*/
13600Sstevel@tonic-gate int
13610Sstevel@tonic-gate bp_color(struct buf *bp)
13620Sstevel@tonic-gate {
13630Sstevel@tonic-gate 	return (0);
13640Sstevel@tonic-gate }
13650Sstevel@tonic-gate 
13660Sstevel@tonic-gate /*
13670Sstevel@tonic-gate  * get a page from any list with the given mnode
13680Sstevel@tonic-gate  */
13690Sstevel@tonic-gate page_t *
13700Sstevel@tonic-gate page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
13710Sstevel@tonic-gate     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
13720Sstevel@tonic-gate {
13732961Sdp78419 	kmutex_t		*pcm;
13742961Sdp78419 	int			i;
13752961Sdp78419 	page_t			*pp;
13762961Sdp78419 	page_t			*first_pp;
13772961Sdp78419 	uint64_t		pgaddr;
13782961Sdp78419 	ulong_t			bin;
13792961Sdp78419 	int			mtypestart;
13802961Sdp78419 	int			plw_initialized;
13812961Sdp78419 	page_list_walker_t	plw;
13820Sstevel@tonic-gate 
13830Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
13840Sstevel@tonic-gate 
13850Sstevel@tonic-gate 	ASSERT((flags & PG_MATCH_COLOR) == 0);
13860Sstevel@tonic-gate 	ASSERT(szc == 0);
13870Sstevel@tonic-gate 	ASSERT(dma_attr != NULL);
13880Sstevel@tonic-gate 
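	/*
	 * MTYPE_START and MTYPE_NEXT pick the starting memory type for
	 * this request and step to the next eligible type, respectively;
	 * mtype goes negative once the types are exhausted, which is what
	 * terminates the do/while loops below.
	 */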
13890Sstevel@tonic-gate 	MTYPE_START(mnode, mtype, flags);
13900Sstevel@tonic-gate 	if (mtype < 0) {
13910Sstevel@tonic-gate 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
13920Sstevel@tonic-gate 		return (NULL);
13930Sstevel@tonic-gate 	}
13940Sstevel@tonic-gate 
13950Sstevel@tonic-gate 	mtypestart = mtype;
13960Sstevel@tonic-gate 
13970Sstevel@tonic-gate 	bin = origbin;
13980Sstevel@tonic-gate 
13990Sstevel@tonic-gate 	/*
14000Sstevel@tonic-gate 	 * check up to page_colors + 1 bins - origbin may be checked twice
14010Sstevel@tonic-gate 	 * because of BIN_STEP skip
14020Sstevel@tonic-gate 	 */
14030Sstevel@tonic-gate 	do {
14042961Sdp78419 		plw_initialized = 0;
14052961Sdp78419 
14062961Sdp78419 		for (plw.plw_count = 0;
14072961Sdp78419 		    plw.plw_count < page_colors; plw.plw_count++) {
14082961Sdp78419 
14090Sstevel@tonic-gate 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
14100Sstevel@tonic-gate 				goto nextfreebin;
14110Sstevel@tonic-gate 
14120Sstevel@tonic-gate 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
14130Sstevel@tonic-gate 			mutex_enter(pcm);
14140Sstevel@tonic-gate 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
14150Sstevel@tonic-gate 			first_pp = pp;
14160Sstevel@tonic-gate 			while (pp != NULL) {
14170Sstevel@tonic-gate 				if (page_trylock(pp, SE_EXCL) == 0) {
14180Sstevel@tonic-gate 					pp = pp->p_next;
14190Sstevel@tonic-gate 					if (pp == first_pp) {
14200Sstevel@tonic-gate 						pp = NULL;
14210Sstevel@tonic-gate 					}
14220Sstevel@tonic-gate 					continue;
14230Sstevel@tonic-gate 				}
14240Sstevel@tonic-gate 
14250Sstevel@tonic-gate 				ASSERT(PP_ISFREE(pp));
14260Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp));
14270Sstevel@tonic-gate 				ASSERT(pp->p_vnode == NULL);
14280Sstevel@tonic-gate 				ASSERT(pp->p_hash == NULL);
14290Sstevel@tonic-gate 				ASSERT(pp->p_offset == (u_offset_t)-1);
14300Sstevel@tonic-gate 				ASSERT(pp->p_szc == szc);
14310Sstevel@tonic-gate 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
14320Sstevel@tonic-gate 				/* check if page within DMA attributes */
14330Sstevel@tonic-gate 				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));
14340Sstevel@tonic-gate 
14350Sstevel@tonic-gate 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
14360Sstevel@tonic-gate 				    (pgaddr + MMU_PAGESIZE - 1 <=
14370Sstevel@tonic-gate 				    dma_attr->dma_attr_addr_hi)) {
14380Sstevel@tonic-gate 					break;
14390Sstevel@tonic-gate 				}
14400Sstevel@tonic-gate 
14410Sstevel@tonic-gate 				/* continue looking */
14420Sstevel@tonic-gate 				page_unlock(pp);
14430Sstevel@tonic-gate 				pp = pp->p_next;
14440Sstevel@tonic-gate 				if (pp == first_pp)
14450Sstevel@tonic-gate 					pp = NULL;
14460Sstevel@tonic-gate 
14470Sstevel@tonic-gate 			}
14480Sstevel@tonic-gate 			if (pp != NULL) {
14490Sstevel@tonic-gate 				ASSERT(mtype == PP_2_MTYPE(pp));
14500Sstevel@tonic-gate 				ASSERT(pp->p_szc == 0);
14510Sstevel@tonic-gate 
14520Sstevel@tonic-gate 				/* found a page with specified DMA attributes */
14530Sstevel@tonic-gate 				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
14540Sstevel@tonic-gate 				    mtype), pp);
1455414Skchow 				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
14560Sstevel@tonic-gate 
14570Sstevel@tonic-gate 				if ((PP_ISFREE(pp) == 0) ||
14580Sstevel@tonic-gate 				    (PP_ISAGED(pp) == 0)) {
14590Sstevel@tonic-gate 					cmn_err(CE_PANIC, "page %p is not free",
14600Sstevel@tonic-gate 					    (void *)pp);
14610Sstevel@tonic-gate 				}
14620Sstevel@tonic-gate 
14630Sstevel@tonic-gate 				mutex_exit(pcm);
14640Sstevel@tonic-gate 				check_dma(dma_attr, pp, 1);
14650Sstevel@tonic-gate 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
14660Sstevel@tonic-gate 				return (pp);
14670Sstevel@tonic-gate 			}
14680Sstevel@tonic-gate 			mutex_exit(pcm);
14690Sstevel@tonic-gate nextfreebin:
14702961Sdp78419 			if (plw_initialized == 0) {
14712961Sdp78419 				page_list_walk_init(szc, 0, bin, 1, 0, &plw);
14722961Sdp78419 				ASSERT(plw.plw_ceq_dif == page_colors);
14732961Sdp78419 				plw_initialized = 1;
14742961Sdp78419 			}
14750Sstevel@tonic-gate 
14762961Sdp78419 			if (plw.plw_do_split) {
14772961Sdp78419 				pp = page_freelist_split(szc, bin, mnode,
14782961Sdp78419 				    mtype,
14792961Sdp78419 				    mmu_btop(dma_attr->dma_attr_addr_hi + 1),
14802961Sdp78419 				    &plw);
14812961Sdp78419 				if (pp != NULL)
14822961Sdp78419 					return (pp);
14832961Sdp78419 			}
14842961Sdp78419 
14852961Sdp78419 			bin = page_list_walk_next_bin(szc, bin, &plw);
14860Sstevel@tonic-gate 		}
14872961Sdp78419 
1488414Skchow 		MTYPE_NEXT(mnode, mtype, flags);
1489414Skchow 	} while (mtype >= 0);
14900Sstevel@tonic-gate 
14910Sstevel@tonic-gate 	/* failed to find a page in the freelist; try it in the cachelist */
14920Sstevel@tonic-gate 
14930Sstevel@tonic-gate 	/* reset mtype start for cachelist search */
14940Sstevel@tonic-gate 	mtype = mtypestart;
14950Sstevel@tonic-gate 	ASSERT(mtype >= 0);
14960Sstevel@tonic-gate 
14970Sstevel@tonic-gate 	/* start with the bin of matching color */
14980Sstevel@tonic-gate 	bin = origbin;
14990Sstevel@tonic-gate 
15000Sstevel@tonic-gate 	do {
15010Sstevel@tonic-gate 		for (i = 0; i <= page_colors; i++) {
15020Sstevel@tonic-gate 			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
15030Sstevel@tonic-gate 				goto nextcachebin;
15040Sstevel@tonic-gate 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
15050Sstevel@tonic-gate 			mutex_enter(pcm);
15060Sstevel@tonic-gate 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
15070Sstevel@tonic-gate 			first_pp = pp;
15080Sstevel@tonic-gate 			while (pp != NULL) {
15090Sstevel@tonic-gate 				if (page_trylock(pp, SE_EXCL) == 0) {
15100Sstevel@tonic-gate 					pp = pp->p_next;
15110Sstevel@tonic-gate 					if (pp == first_pp)
15120Sstevel@tonic-gate 						break;
15130Sstevel@tonic-gate 					continue;
15140Sstevel@tonic-gate 				}
15150Sstevel@tonic-gate 				ASSERT(pp->p_vnode);
15160Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp) == 0);
15170Sstevel@tonic-gate 				ASSERT(pp->p_szc == 0);
15180Sstevel@tonic-gate 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
15190Sstevel@tonic-gate 
15200Sstevel@tonic-gate 				/* check if page within DMA attributes */
15210Sstevel@tonic-gate 
15220Sstevel@tonic-gate 				pgaddr = ptob((uint64_t)(pp->p_pagenum));
15230Sstevel@tonic-gate 
15240Sstevel@tonic-gate 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
15250Sstevel@tonic-gate 				    (pgaddr + MMU_PAGESIZE - 1 <=
15260Sstevel@tonic-gate 				    dma_attr->dma_attr_addr_hi)) {
15270Sstevel@tonic-gate 					break;
15280Sstevel@tonic-gate 				}
15290Sstevel@tonic-gate 
15300Sstevel@tonic-gate 				/* continue looking */
15310Sstevel@tonic-gate 				page_unlock(pp);
15320Sstevel@tonic-gate 				pp = pp->p_next;
15330Sstevel@tonic-gate 				if (pp == first_pp)
15340Sstevel@tonic-gate 					pp = NULL;
15350Sstevel@tonic-gate 			}
15360Sstevel@tonic-gate 
15370Sstevel@tonic-gate 			if (pp != NULL) {
15380Sstevel@tonic-gate 				ASSERT(mtype == PP_2_MTYPE(pp));
15390Sstevel@tonic-gate 				ASSERT(pp->p_szc == 0);
15400Sstevel@tonic-gate 
15410Sstevel@tonic-gate 				/* found a page with specified DMA attributes */
15420Sstevel@tonic-gate 				page_sub(&PAGE_CACHELISTS(mnode, bin,
15430Sstevel@tonic-gate 				    mtype), pp);
1544414Skchow 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
15450Sstevel@tonic-gate 
15460Sstevel@tonic-gate 				mutex_exit(pcm);
15470Sstevel@tonic-gate 				ASSERT(pp->p_vnode);
15480Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp) == 0);
15490Sstevel@tonic-gate 				check_dma(dma_attr, pp, 1);
15500Sstevel@tonic-gate 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
15510Sstevel@tonic-gate 				return (pp);
15520Sstevel@tonic-gate 			}
15530Sstevel@tonic-gate 			mutex_exit(pcm);
15540Sstevel@tonic-gate nextcachebin:
15550Sstevel@tonic-gate 			bin += (i == 0) ? BIN_STEP : 1;
15560Sstevel@tonic-gate 			bin &= page_colors_mask;
15570Sstevel@tonic-gate 		}
1558414Skchow 		MTYPE_NEXT(mnode, mtype, flags);
1559414Skchow 	} while (mtype >= 0);
15600Sstevel@tonic-gate 
15610Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
15620Sstevel@tonic-gate 	return (NULL);
15630Sstevel@tonic-gate }
15640Sstevel@tonic-gate 
15650Sstevel@tonic-gate /*
15660Sstevel@tonic-gate  * This function is similar to page_get_freelist()/page_get_cachelist()
15670Sstevel@tonic-gate  * but it searches both lists to find a page with the specified
15680Sstevel@tonic-gate  * color (or no color) and DMA attributes. The search is done in the
15690Sstevel@tonic-gate  * freelist first and then in the cache list within the highest memory
15700Sstevel@tonic-gate  * range (based on DMA attributes) before searching in the lower
15710Sstevel@tonic-gate  * memory ranges.
15720Sstevel@tonic-gate  *
15730Sstevel@tonic-gate  * Note: This function is called only by page_create_io().
15740Sstevel@tonic-gate  */
15750Sstevel@tonic-gate /*ARGSUSED*/
15760Sstevel@tonic-gate page_t *
15770Sstevel@tonic-gate page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
15780Sstevel@tonic-gate     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
15790Sstevel@tonic-gate {
15800Sstevel@tonic-gate 	uint_t		bin;
15810Sstevel@tonic-gate 	int		mtype;
15820Sstevel@tonic-gate 	page_t		*pp;
15830Sstevel@tonic-gate 	int		n;
15840Sstevel@tonic-gate 	int		m;
15850Sstevel@tonic-gate 	int		szc;
15860Sstevel@tonic-gate 	int		fullrange;
15870Sstevel@tonic-gate 	int		mnode;
15880Sstevel@tonic-gate 	int		local_failed_stat = 0;
15890Sstevel@tonic-gate 	lgrp_mnode_cookie_t	lgrp_cookie;
15900Sstevel@tonic-gate 
15910Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pga_alloc);
15920Sstevel@tonic-gate 
15930Sstevel@tonic-gate 	/* only base pagesize currently supported */
15940Sstevel@tonic-gate 	if (size != MMU_PAGESIZE)
15950Sstevel@tonic-gate 		return (NULL);
15960Sstevel@tonic-gate 
15970Sstevel@tonic-gate 	/*
15980Sstevel@tonic-gate 	 * If we're passed a specific lgroup, we use it.  Otherwise,
15990Sstevel@tonic-gate 	 * assume first-touch placement is desired.
16000Sstevel@tonic-gate 	 */
16010Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp))
16020Sstevel@tonic-gate 		lgrp = lgrp_home_lgrp();
16030Sstevel@tonic-gate 
16040Sstevel@tonic-gate 	/* LINTED */
16052961Sdp78419 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
16060Sstevel@tonic-gate 
16070Sstevel@tonic-gate 	/*
16080Sstevel@tonic-gate 	 * Only hold one freelist or cachelist lock at a time, that way we
16090Sstevel@tonic-gate 	 * can start anywhere and not have to worry about lock
16100Sstevel@tonic-gate 	 * ordering.
16110Sstevel@tonic-gate 	 */
16120Sstevel@tonic-gate 	if (dma_attr == NULL) {
16130Sstevel@tonic-gate 		n = 0;
16140Sstevel@tonic-gate 		m = mnoderangecnt - 1;
16150Sstevel@tonic-gate 		fullrange = 1;
16160Sstevel@tonic-gate 		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
16170Sstevel@tonic-gate 	} else {
16180Sstevel@tonic-gate 		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
16190Sstevel@tonic-gate 		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
16200Sstevel@tonic-gate 
16210Sstevel@tonic-gate 		/*
16220Sstevel@tonic-gate 		 * We can only guarantee page-boundary alignment.
16230Sstevel@tonic-gate 		 */
16240Sstevel@tonic-gate 		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
16250Sstevel@tonic-gate 			return (NULL);
16260Sstevel@tonic-gate 
16270Sstevel@tonic-gate 		n = pfn_2_mtype(pfnlo);
16280Sstevel@tonic-gate 		m = pfn_2_mtype(pfnhi);
16290Sstevel@tonic-gate 
16300Sstevel@tonic-gate 		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
16310Sstevel@tonic-gate 		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
16320Sstevel@tonic-gate 	}
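	/*
	 * fullrange is set when the DMA limits span the selected memory
	 * type ranges completely; in that case the ordinary freelist and
	 * cachelist routines can be used below.  Otherwise each candidate
	 * page has to be checked against the DMA attributes, which is
	 * what page_get_mnode_anylist() does.
	 */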
16330Sstevel@tonic-gate 	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
16340Sstevel@tonic-gate 
16350Sstevel@tonic-gate 	if (n > m)
16360Sstevel@tonic-gate 		return (NULL);
16370Sstevel@tonic-gate 
16380Sstevel@tonic-gate 	szc = 0;
16390Sstevel@tonic-gate 
16400Sstevel@tonic-gate 	/* cycling through mtype is handled by RANGE0 if n == 0 */
16410Sstevel@tonic-gate 	if (n == 0) {
16420Sstevel@tonic-gate 		flags |= PGI_MT_RANGE0;
16430Sstevel@tonic-gate 		n = m;
16440Sstevel@tonic-gate 	}
16450Sstevel@tonic-gate 
16460Sstevel@tonic-gate 	/*
16470Sstevel@tonic-gate 	 * Try local memory node first, but try remote if we can't
16480Sstevel@tonic-gate 	 * get a page of the right color.
16490Sstevel@tonic-gate 	 */
16500Sstevel@tonic-gate 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
16510Sstevel@tonic-gate 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
16520Sstevel@tonic-gate 		/*
16530Sstevel@tonic-gate 		 * allocate pages from high pfn to low.
16540Sstevel@tonic-gate 		 */
16550Sstevel@tonic-gate 		for (mtype = m; mtype >= n; mtype--) {
16560Sstevel@tonic-gate 			if (fullrange != 0) {
16570Sstevel@tonic-gate 				pp = page_get_mnode_freelist(mnode,
16580Sstevel@tonic-gate 				    bin, mtype, szc, flags);
16590Sstevel@tonic-gate 				if (pp == NULL) {
16600Sstevel@tonic-gate 					pp = page_get_mnode_cachelist(
16610Sstevel@tonic-gate 						bin, flags, mnode, mtype);
16620Sstevel@tonic-gate 				}
16630Sstevel@tonic-gate 			} else {
16640Sstevel@tonic-gate 				pp = page_get_mnode_anylist(bin, szc,
16650Sstevel@tonic-gate 				    flags, mnode, mtype, dma_attr);
16660Sstevel@tonic-gate 			}
16670Sstevel@tonic-gate 			if (pp != NULL) {
16680Sstevel@tonic-gate 				VM_STAT_ADD(pga_vmstats.pga_allocok);
16690Sstevel@tonic-gate 				check_dma(dma_attr, pp, 1);
16700Sstevel@tonic-gate 				return (pp);
16710Sstevel@tonic-gate 			}
16720Sstevel@tonic-gate 		}
16730Sstevel@tonic-gate 		if (!local_failed_stat) {
16740Sstevel@tonic-gate 			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
16750Sstevel@tonic-gate 			local_failed_stat = 1;
16760Sstevel@tonic-gate 		}
16770Sstevel@tonic-gate 	}
16780Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pga_allocfailed);
16790Sstevel@tonic-gate 
16800Sstevel@tonic-gate 	return (NULL);
16810Sstevel@tonic-gate }
16820Sstevel@tonic-gate 
16830Sstevel@tonic-gate /*
16840Sstevel@tonic-gate  * page_create_io()
16850Sstevel@tonic-gate  *
16860Sstevel@tonic-gate  * This function is a copy of page_create_va() with an additional
16870Sstevel@tonic-gate  * argument 'mattr' that specifies DMA memory requirements to
16880Sstevel@tonic-gate  * the page list functions. This function is used by the segkmem
16890Sstevel@tonic-gate  * allocator, so it is only used to create new pages (i.e., PG_EXCL is
16900Sstevel@tonic-gate  * set).
16910Sstevel@tonic-gate  *
16920Sstevel@tonic-gate  * Note: This interface is currently used by x86 PSM only and is
16930Sstevel@tonic-gate  *	 not fully specified so the commitment level is only for
16940Sstevel@tonic-gate  *	 a private interface specific to x86. It uses the PSM-specific
16950Sstevel@tonic-gate  *	 page_get_anylist() interface.
16960Sstevel@tonic-gate  */
16970Sstevel@tonic-gate 
16980Sstevel@tonic-gate #define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
16990Sstevel@tonic-gate 	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
17000Sstevel@tonic-gate 		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
17010Sstevel@tonic-gate 			break; \
17020Sstevel@tonic-gate 	} \
17030Sstevel@tonic-gate }
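/*
 * On return from PAGE_HASH_SEARCH, pp points at the page matching
 * (vp, off) on the hash chain at 'index', or is NULL if there is no
 * such page.  The caller is expected to hold the corresponding
 * PAGE_HASH_MUTEX, as is done in the loop below.
 */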
17040Sstevel@tonic-gate 
17050Sstevel@tonic-gate 
17060Sstevel@tonic-gate page_t *
17070Sstevel@tonic-gate page_create_io(
17080Sstevel@tonic-gate 	struct vnode	*vp,
17090Sstevel@tonic-gate 	u_offset_t	off,
17100Sstevel@tonic-gate 	uint_t		bytes,
17110Sstevel@tonic-gate 	uint_t		flags,
17120Sstevel@tonic-gate 	struct as	*as,
17130Sstevel@tonic-gate 	caddr_t		vaddr,
17140Sstevel@tonic-gate 	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
17150Sstevel@tonic-gate {
17160Sstevel@tonic-gate 	page_t		*plist = NULL;
17170Sstevel@tonic-gate 	uint_t		plist_len = 0;
17180Sstevel@tonic-gate 	pgcnt_t		npages;
17190Sstevel@tonic-gate 	page_t		*npp = NULL;
17200Sstevel@tonic-gate 	uint_t		pages_req;
17210Sstevel@tonic-gate 	page_t		*pp;
17220Sstevel@tonic-gate 	kmutex_t	*phm = NULL;
17230Sstevel@tonic-gate 	uint_t		index;
17240Sstevel@tonic-gate 
17250Sstevel@tonic-gate 	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
17260Sstevel@tonic-gate 		"page_create_start:vp %p off %llx bytes %u flags %x",
17270Sstevel@tonic-gate 		vp, off, bytes, flags);
17280Sstevel@tonic-gate 
17290Sstevel@tonic-gate 	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
17300Sstevel@tonic-gate 
17310Sstevel@tonic-gate 	pages_req = npages = mmu_btopr(bytes);
17320Sstevel@tonic-gate 
17330Sstevel@tonic-gate 	/*
17340Sstevel@tonic-gate 	 * Do the freemem and pcf accounting.
17350Sstevel@tonic-gate 	 */
17360Sstevel@tonic-gate 	if (!page_create_wait(npages, flags)) {
17370Sstevel@tonic-gate 		return (NULL);
17380Sstevel@tonic-gate 	}
17390Sstevel@tonic-gate 
17400Sstevel@tonic-gate 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
17410Sstevel@tonic-gate 		"page_create_success:vp %p off %llx",
17420Sstevel@tonic-gate 		vp, off);
17430Sstevel@tonic-gate 
17440Sstevel@tonic-gate 	/*
17450Sstevel@tonic-gate 	 * If satisfying this request has left us with too little
17460Sstevel@tonic-gate 	 * memory, start the wheels turning to get some back.  The
17470Sstevel@tonic-gate 	 * first clause of the test prevents waking up the pageout
17480Sstevel@tonic-gate 	 * daemon in situations where it would decide that there's
17490Sstevel@tonic-gate 	 * nothing to do.
17500Sstevel@tonic-gate 	 */
17510Sstevel@tonic-gate 	if (nscan < desscan && freemem < minfree) {
17520Sstevel@tonic-gate 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
17530Sstevel@tonic-gate 			"pageout_cv_signal:freemem %ld", freemem);
17540Sstevel@tonic-gate 		cv_signal(&proc_pageout->p_cv);
17550Sstevel@tonic-gate 	}
17560Sstevel@tonic-gate 
17570Sstevel@tonic-gate 	if (flags & PG_PHYSCONTIG) {
17580Sstevel@tonic-gate 
17590Sstevel@tonic-gate 		plist = page_get_contigpage(&npages, mattr, 1);
17600Sstevel@tonic-gate 		if (plist == NULL) {
17610Sstevel@tonic-gate 			page_create_putback(npages);
17620Sstevel@tonic-gate 			return (NULL);
17630Sstevel@tonic-gate 		}
17640Sstevel@tonic-gate 
17650Sstevel@tonic-gate 		pp = plist;
17660Sstevel@tonic-gate 
17670Sstevel@tonic-gate 		do {
17680Sstevel@tonic-gate 			if (!page_hashin(pp, vp, off, NULL)) {
17690Sstevel@tonic-gate 				panic("pg_creat_io: hashin failed %p %p %llx",
17700Sstevel@tonic-gate 				    (void *)pp, (void *)vp, off);
17710Sstevel@tonic-gate 			}
17720Sstevel@tonic-gate 			VM_STAT_ADD(page_create_new);
17730Sstevel@tonic-gate 			off += MMU_PAGESIZE;
17740Sstevel@tonic-gate 			PP_CLRFREE(pp);
17750Sstevel@tonic-gate 			PP_CLRAGED(pp);
17760Sstevel@tonic-gate 			page_set_props(pp, P_REF);
17770Sstevel@tonic-gate 			pp = pp->p_next;
17780Sstevel@tonic-gate 		} while (pp != plist);
17790Sstevel@tonic-gate 
17800Sstevel@tonic-gate 		if (!npages) {
17810Sstevel@tonic-gate 			check_dma(mattr, plist, pages_req);
17820Sstevel@tonic-gate 			return (plist);
17830Sstevel@tonic-gate 		} else {
17840Sstevel@tonic-gate 			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
17850Sstevel@tonic-gate 		}
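		/*
		 * 'off' was advanced page by page in the loop above, and
		 * 'vaddr' has just been advanced past the pages already
		 * obtained, so the per-page allocations below derive their
		 * color hints from the correct virtual addresses.
		 */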
17860Sstevel@tonic-gate 
17870Sstevel@tonic-gate 		/*
17880Sstevel@tonic-gate 		 * fall-thru:
17890Sstevel@tonic-gate 		 *
17900Sstevel@tonic-gate 		 * page_get_contigpage returns when npages <= sgllen.
17910Sstevel@tonic-gate 		 * Grab the rest of the non-contig pages below from anylist.
17920Sstevel@tonic-gate 		 */
17930Sstevel@tonic-gate 	}
17940Sstevel@tonic-gate 
17950Sstevel@tonic-gate 	/*
17960Sstevel@tonic-gate 	 * Loop around collecting the requested number of pages.
17970Sstevel@tonic-gate 	 * Most of the time, we have to `create' a new page. With
17980Sstevel@tonic-gate 	 * this in mind, pull the page off the free list before
17990Sstevel@tonic-gate 	 * getting the hash lock.  This will minimize the hash
18000Sstevel@tonic-gate 	 * lock hold time, nesting, and the like.  If it turns
18010Sstevel@tonic-gate 	 * out we don't need the page, we put it back at the end.
18020Sstevel@tonic-gate 	 */
18030Sstevel@tonic-gate 	while (npages--) {
18040Sstevel@tonic-gate 		phm = NULL;
18050Sstevel@tonic-gate 
18060Sstevel@tonic-gate 		index = PAGE_HASH_FUNC(vp, off);
18070Sstevel@tonic-gate top:
18080Sstevel@tonic-gate 		ASSERT(phm == NULL);
18090Sstevel@tonic-gate 		ASSERT(index == PAGE_HASH_FUNC(vp, off));
18100Sstevel@tonic-gate 		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
18110Sstevel@tonic-gate 
18120Sstevel@tonic-gate 		if (npp == NULL) {
18130Sstevel@tonic-gate 			/*
18140Sstevel@tonic-gate 			 * Try to get the page of any color either from
18150Sstevel@tonic-gate 			 * the freelist or from the cache list.
18160Sstevel@tonic-gate 			 */
18170Sstevel@tonic-gate 			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
18180Sstevel@tonic-gate 			    flags & ~PG_MATCH_COLOR, mattr, NULL);
18190Sstevel@tonic-gate 			if (npp == NULL) {
18200Sstevel@tonic-gate 				if (mattr == NULL) {
18210Sstevel@tonic-gate 					/*
18220Sstevel@tonic-gate 					 * Not looking for a special page;
18230Sstevel@tonic-gate 					 * panic!
18240Sstevel@tonic-gate 					 */
18250Sstevel@tonic-gate 					panic("no page found %d", (int)npages);
18260Sstevel@tonic-gate 				}
18270Sstevel@tonic-gate 				/*
18280Sstevel@tonic-gate 				 * No page found! This can happen
18290Sstevel@tonic-gate 				 * if we are looking for a page
18300Sstevel@tonic-gate 				 * within a specific memory range
18310Sstevel@tonic-gate 				 * for DMA purposes. If PG_WAIT is
18320Sstevel@tonic-gate 				 * specified then we wait for a
18330Sstevel@tonic-gate 				 * while and then try again. The
18340Sstevel@tonic-gate 				 * wait could be forever if we
18350Sstevel@tonic-gate 				 * don't get the page(s) we need.
18360Sstevel@tonic-gate 				 *
18370Sstevel@tonic-gate 				 * Note: XXX We really need a mechanism
18380Sstevel@tonic-gate 				 * to wait for pages in the desired
18390Sstevel@tonic-gate 				 * range. For now, we wait for any
18400Sstevel@tonic-gate 				 * pages and see if we can use them.
18410Sstevel@tonic-gate 				 */
18420Sstevel@tonic-gate 
18430Sstevel@tonic-gate 				if ((mattr != NULL) && (flags & PG_WAIT)) {
18440Sstevel@tonic-gate 					delay(10);
18450Sstevel@tonic-gate 					goto top;
18460Sstevel@tonic-gate 				}
18470Sstevel@tonic-gate 
18480Sstevel@tonic-gate 				goto fail; /* undo accounting stuff */
18490Sstevel@tonic-gate 			}
18500Sstevel@tonic-gate 
18510Sstevel@tonic-gate 			if (PP_ISAGED(npp) == 0) {
18520Sstevel@tonic-gate 				/*
18530Sstevel@tonic-gate 				 * Since this page came from the
18540Sstevel@tonic-gate 				 * cachelist, we must destroy the
18550Sstevel@tonic-gate 				 * old vnode association.
18560Sstevel@tonic-gate 				 */
18570Sstevel@tonic-gate 				page_hashout(npp, (kmutex_t *)NULL);
18580Sstevel@tonic-gate 			}
18590Sstevel@tonic-gate 		}
18600Sstevel@tonic-gate 
18610Sstevel@tonic-gate 		/*
18620Sstevel@tonic-gate 		 * We own this page!
18630Sstevel@tonic-gate 		 */
18640Sstevel@tonic-gate 		ASSERT(PAGE_EXCL(npp));
18650Sstevel@tonic-gate 		ASSERT(npp->p_vnode == NULL);
18660Sstevel@tonic-gate 		ASSERT(!hat_page_is_mapped(npp));
18670Sstevel@tonic-gate 		PP_CLRFREE(npp);
18680Sstevel@tonic-gate 		PP_CLRAGED(npp);
18690Sstevel@tonic-gate 
18700Sstevel@tonic-gate 		/*
18710Sstevel@tonic-gate 		 * Here we have a page in our hot little mitts and are
18720Sstevel@tonic-gate 		 * just waiting to stuff it on the appropriate lists.
18730Sstevel@tonic-gate 		 * Get the mutex and check to see if it really does
18740Sstevel@tonic-gate 		 * not exist.
18750Sstevel@tonic-gate 		 */
18760Sstevel@tonic-gate 		phm = PAGE_HASH_MUTEX(index);
18770Sstevel@tonic-gate 		mutex_enter(phm);
18780Sstevel@tonic-gate 		PAGE_HASH_SEARCH(index, pp, vp, off);
18790Sstevel@tonic-gate 		if (pp == NULL) {
18800Sstevel@tonic-gate 			VM_STAT_ADD(page_create_new);
18810Sstevel@tonic-gate 			pp = npp;
18820Sstevel@tonic-gate 			npp = NULL;
18830Sstevel@tonic-gate 			if (!page_hashin(pp, vp, off, phm)) {
18840Sstevel@tonic-gate 				/*
18850Sstevel@tonic-gate 				 * Since we hold the page hash mutex and
18860Sstevel@tonic-gate 				 * just searched for this page, page_hashin
18870Sstevel@tonic-gate 				 * had better not fail.  If it does, that
18880Sstevel@tonic-gate 				 * means some thread did not follow the
18890Sstevel@tonic-gate 				 * page hash mutex rules.  Panic now and
18900Sstevel@tonic-gate 				 * get it over with.  As usual, go down
18910Sstevel@tonic-gate 				 * holding all the locks.
18920Sstevel@tonic-gate 				 */
18930Sstevel@tonic-gate 				ASSERT(MUTEX_HELD(phm));
18940Sstevel@tonic-gate 				panic("page_create: hashin fail %p %p %llx %p",
18950Sstevel@tonic-gate 				    (void *)pp, (void *)vp, off, (void *)phm);
18960Sstevel@tonic-gate 
18970Sstevel@tonic-gate 			}
18980Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(phm));
18990Sstevel@tonic-gate 			mutex_exit(phm);
19000Sstevel@tonic-gate 			phm = NULL;
19010Sstevel@tonic-gate 
19020Sstevel@tonic-gate 			/*
19030Sstevel@tonic-gate 			 * Hat layer locking need not be done to set
19040Sstevel@tonic-gate 			 * the following bits since the page is not hashed
19050Sstevel@tonic-gate 			 * and was on the free list (i.e., had no mappings).
19060Sstevel@tonic-gate 			 *
19070Sstevel@tonic-gate 			 * Set the reference bit to protect
19080Sstevel@tonic-gate 			 * against immediate pageout
19090Sstevel@tonic-gate 			 *
19100Sstevel@tonic-gate 			 * XXXmh modify freelist code to set reference
19110Sstevel@tonic-gate 			 * bit so we don't have to do it here.
19120Sstevel@tonic-gate 			 */
19130Sstevel@tonic-gate 			page_set_props(pp, P_REF);
19140Sstevel@tonic-gate 		} else {
19150Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(phm));
19160Sstevel@tonic-gate 			mutex_exit(phm);
19170Sstevel@tonic-gate 			phm = NULL;
19180Sstevel@tonic-gate 			/*
19190Sstevel@tonic-gate 			 * NOTE: This should not happen for pages associated
19200Sstevel@tonic-gate 			 *	 with kernel vnode 'kvp'.
19210Sstevel@tonic-gate 			 */
19220Sstevel@tonic-gate 			/* XX64 - to debug why this happens! */
1923*3290Sjohansen 			ASSERT(!VN_ISKAS(vp));
1924*3290Sjohansen 			if (VN_ISKAS(vp))
19250Sstevel@tonic-gate 				cmn_err(CE_NOTE,
19260Sstevel@tonic-gate 				    "page_create: page not expected "
19270Sstevel@tonic-gate 				    "in hash list for kernel vnode - pp 0x%p",
19280Sstevel@tonic-gate 				    (void *)pp);
19290Sstevel@tonic-gate 			VM_STAT_ADD(page_create_exists);
19300Sstevel@tonic-gate 			goto fail;
19310Sstevel@tonic-gate 		}
19320Sstevel@tonic-gate 
19330Sstevel@tonic-gate 		/*
19340Sstevel@tonic-gate 		 * Got a page!  It is locked.  Acquire the i/o
19350Sstevel@tonic-gate 		 * lock since we are going to use the p_next and
19360Sstevel@tonic-gate 		 * p_prev fields to link the requested pages together.
19370Sstevel@tonic-gate 		 */
19380Sstevel@tonic-gate 		page_io_lock(pp);
19390Sstevel@tonic-gate 		page_add(&plist, pp);
19400Sstevel@tonic-gate 		plist = plist->p_next;
19410Sstevel@tonic-gate 		off += MMU_PAGESIZE;
19420Sstevel@tonic-gate 		vaddr += MMU_PAGESIZE;
19430Sstevel@tonic-gate 	}
19440Sstevel@tonic-gate 
19450Sstevel@tonic-gate 	check_dma(mattr, plist, pages_req);
19460Sstevel@tonic-gate 	return (plist);
19470Sstevel@tonic-gate 
19480Sstevel@tonic-gate fail:
19490Sstevel@tonic-gate 	if (npp != NULL) {
19500Sstevel@tonic-gate 		/*
19510Sstevel@tonic-gate 		 * Did not need this page after all.
19520Sstevel@tonic-gate 		 * Put it back on the free list.
19530Sstevel@tonic-gate 		 */
19540Sstevel@tonic-gate 		VM_STAT_ADD(page_create_putbacks);
19550Sstevel@tonic-gate 		PP_SETFREE(npp);
19560Sstevel@tonic-gate 		PP_SETAGED(npp);
19570Sstevel@tonic-gate 		npp->p_offset = (u_offset_t)-1;
19580Sstevel@tonic-gate 		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
19590Sstevel@tonic-gate 		page_unlock(npp);
19600Sstevel@tonic-gate 	}
19610Sstevel@tonic-gate 
19620Sstevel@tonic-gate 	/*
19630Sstevel@tonic-gate 	 * Give up the pages we already got.
19640Sstevel@tonic-gate 	 */
19650Sstevel@tonic-gate 	while (plist != NULL) {
19660Sstevel@tonic-gate 		pp = plist;
19670Sstevel@tonic-gate 		page_sub(&plist, pp);
19680Sstevel@tonic-gate 		page_io_unlock(pp);
19690Sstevel@tonic-gate 		plist_len++;
19700Sstevel@tonic-gate 		/*LINTED: constant in conditional ctx*/
19710Sstevel@tonic-gate 		VN_DISPOSE(pp, B_INVAL, 0, kcred);
19720Sstevel@tonic-gate 	}
19730Sstevel@tonic-gate 
19740Sstevel@tonic-gate 	/*
19750Sstevel@tonic-gate 	 * VN_DISPOSE does freemem accounting for the pages in plist
19760Sstevel@tonic-gate 	 * by calling page_free. So, we need to undo the pcf accounting
19770Sstevel@tonic-gate 	 * for only the remaining pages.
19780Sstevel@tonic-gate 	 */
19790Sstevel@tonic-gate 	VM_STAT_ADD(page_create_putbacks);
19800Sstevel@tonic-gate 	page_create_putback(pages_req - plist_len);
19810Sstevel@tonic-gate 
19820Sstevel@tonic-gate 	return (NULL);
19830Sstevel@tonic-gate }
19840Sstevel@tonic-gate 
19850Sstevel@tonic-gate 
19860Sstevel@tonic-gate /*
19870Sstevel@tonic-gate  * Copy the data from the physical page represented by "frompp" to
19880Sstevel@tonic-gate  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
19890Sstevel@tonic-gate  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
19900Sstevel@tonic-gate  * level and no one sleeps with an active mapping there.
19910Sstevel@tonic-gate  *
19920Sstevel@tonic-gate  * Note that the ref/mod bits in the page_t's are not affected by
19930Sstevel@tonic-gate  * this operation, hence it is up to the caller to update them appropriately.
19940Sstevel@tonic-gate  */
19953253Smec int
19960Sstevel@tonic-gate ppcopy(page_t *frompp, page_t *topp)
19970Sstevel@tonic-gate {
19980Sstevel@tonic-gate 	caddr_t		pp_addr1;
19990Sstevel@tonic-gate 	caddr_t		pp_addr2;
20000Sstevel@tonic-gate 	void		*pte1;
20010Sstevel@tonic-gate 	void		*pte2;
20020Sstevel@tonic-gate 	kmutex_t	*ppaddr_mutex;
20033253Smec 	label_t		ljb;
20043253Smec 	int		ret = 1;
20050Sstevel@tonic-gate 
20060Sstevel@tonic-gate 	ASSERT_STACK_ALIGNED();
20070Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(frompp));
20080Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(topp));
20090Sstevel@tonic-gate 
20100Sstevel@tonic-gate 	if (kpm_enable) {
20110Sstevel@tonic-gate 		pp_addr1 = hat_kpm_page2va(frompp, 0);
20120Sstevel@tonic-gate 		pp_addr2 = hat_kpm_page2va(topp, 0);
20130Sstevel@tonic-gate 		kpreempt_disable();
20140Sstevel@tonic-gate 	} else {
20150Sstevel@tonic-gate 		/*
20160Sstevel@tonic-gate 		 * disable preemption so that the CPU can't change out from under us
20170Sstevel@tonic-gate 		 */
20180Sstevel@tonic-gate 		kpreempt_disable();
20190Sstevel@tonic-gate 
20200Sstevel@tonic-gate 		pp_addr1 = CPU->cpu_caddr1;
20210Sstevel@tonic-gate 		pp_addr2 = CPU->cpu_caddr2;
20220Sstevel@tonic-gate 		pte1 = (void *)CPU->cpu_caddr1pte;
20230Sstevel@tonic-gate 		pte2 = (void *)CPU->cpu_caddr2pte;
20240Sstevel@tonic-gate 
20250Sstevel@tonic-gate 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
20260Sstevel@tonic-gate 		mutex_enter(ppaddr_mutex);
20270Sstevel@tonic-gate 
20280Sstevel@tonic-gate 		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
20290Sstevel@tonic-gate 		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
20300Sstevel@tonic-gate 		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
20310Sstevel@tonic-gate 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
20320Sstevel@tonic-gate 		    HAT_LOAD_NOCONSIST);
20330Sstevel@tonic-gate 	}
20340Sstevel@tonic-gate 
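	/*
	 * Guard the copy itself with on_fault() so that a trap taken while
	 * touching either page (e.g. due to an uncorrectable memory error)
	 * lands in the 'faulted' path below and ppcopy() returns 0 rather
	 * than taking the system down; the mappings and mutex are still
	 * cleaned up on that path.
	 */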
20353253Smec 	if (on_fault(&ljb)) {
20363253Smec 		ret = 0;
20373253Smec 		goto faulted;
20383253Smec 	}
20390Sstevel@tonic-gate 	if (use_sse_pagecopy)
20400Sstevel@tonic-gate 		hwblkpagecopy(pp_addr1, pp_addr2);
20410Sstevel@tonic-gate 	else
20420Sstevel@tonic-gate 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
20430Sstevel@tonic-gate 
20443253Smec 	no_fault();
20453253Smec faulted:
20460Sstevel@tonic-gate 	if (!kpm_enable)
20470Sstevel@tonic-gate 		mutex_exit(ppaddr_mutex);
20480Sstevel@tonic-gate 	kpreempt_enable();
20493253Smec 	return (ret);
20500Sstevel@tonic-gate }
20510Sstevel@tonic-gate 
20520Sstevel@tonic-gate /*
20530Sstevel@tonic-gate  * Zero the physical page from off to off + len given by `pp'
20540Sstevel@tonic-gate  * without changing the reference and modified bits of page.
20550Sstevel@tonic-gate  *
20560Sstevel@tonic-gate  * We do this using CPU private page address #2; see ppcopy() for more info.
20570Sstevel@tonic-gate  * pagezero() must not be called at interrupt level.
20580Sstevel@tonic-gate  */
20590Sstevel@tonic-gate void
20600Sstevel@tonic-gate pagezero(page_t *pp, uint_t off, uint_t len)
20610Sstevel@tonic-gate {
20620Sstevel@tonic-gate 	caddr_t		pp_addr2;
20630Sstevel@tonic-gate 	void		*pte2;
20640Sstevel@tonic-gate 	kmutex_t	*ppaddr_mutex;
20650Sstevel@tonic-gate 
20660Sstevel@tonic-gate 	ASSERT_STACK_ALIGNED();
20670Sstevel@tonic-gate 	ASSERT(len <= MMU_PAGESIZE);
20680Sstevel@tonic-gate 	ASSERT(off <= MMU_PAGESIZE);
20690Sstevel@tonic-gate 	ASSERT(off + len <= MMU_PAGESIZE);
20700Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp));
20710Sstevel@tonic-gate 
20720Sstevel@tonic-gate 	if (kpm_enable) {
20730Sstevel@tonic-gate 		pp_addr2 = hat_kpm_page2va(pp, 0);
20740Sstevel@tonic-gate 		kpreempt_disable();
20750Sstevel@tonic-gate 	} else {
20760Sstevel@tonic-gate 		kpreempt_disable();
20770Sstevel@tonic-gate 
20780Sstevel@tonic-gate 		pp_addr2 = CPU->cpu_caddr2;
20790Sstevel@tonic-gate 		pte2 = (void *)CPU->cpu_caddr2pte;
20800Sstevel@tonic-gate 
20810Sstevel@tonic-gate 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
20820Sstevel@tonic-gate 		mutex_enter(ppaddr_mutex);
20830Sstevel@tonic-gate 
20840Sstevel@tonic-gate 		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
20850Sstevel@tonic-gate 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
20860Sstevel@tonic-gate 		    HAT_LOAD_NOCONSIST);
20870Sstevel@tonic-gate 	}
20880Sstevel@tonic-gate 
20890Sstevel@tonic-gate 	if (use_sse_pagezero)
20900Sstevel@tonic-gate 		hwblkclr(pp_addr2 + off, len);
20910Sstevel@tonic-gate 	else
20920Sstevel@tonic-gate 		bzero(pp_addr2 + off, len);
20930Sstevel@tonic-gate 
20940Sstevel@tonic-gate 	if (!kpm_enable)
20950Sstevel@tonic-gate 		mutex_exit(ppaddr_mutex);
20960Sstevel@tonic-gate 	kpreempt_enable();
20970Sstevel@tonic-gate }
20980Sstevel@tonic-gate 
20990Sstevel@tonic-gate /*
21000Sstevel@tonic-gate  * Platform-dependent page scrub call.
21010Sstevel@tonic-gate  */
21020Sstevel@tonic-gate void
21030Sstevel@tonic-gate pagescrub(page_t *pp, uint_t off, uint_t len)
21040Sstevel@tonic-gate {
21050Sstevel@tonic-gate 	/*
21060Sstevel@tonic-gate 	 * For now, we rely on the fact that pagezero() will
21070Sstevel@tonic-gate 	 * always clear UEs.
21080Sstevel@tonic-gate 	 */
21090Sstevel@tonic-gate 	pagezero(pp, off, len);
21100Sstevel@tonic-gate }
21110Sstevel@tonic-gate 
21120Sstevel@tonic-gate /*
21130Sstevel@tonic-gate  * set up two private virtual addresses on a given CPU for use by ppcopy()
21140Sstevel@tonic-gate  */
21150Sstevel@tonic-gate void
21160Sstevel@tonic-gate setup_vaddr_for_ppcopy(struct cpu *cpup)
21170Sstevel@tonic-gate {
21180Sstevel@tonic-gate 	void *addr;
21190Sstevel@tonic-gate 	void *pte;
21200Sstevel@tonic-gate 
21210Sstevel@tonic-gate 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
21220Sstevel@tonic-gate 	pte = hat_mempte_setup(addr);
21230Sstevel@tonic-gate 	cpup->cpu_caddr1 = addr;
21240Sstevel@tonic-gate 	cpup->cpu_caddr1pte = (pteptr_t)pte;
21250Sstevel@tonic-gate 
21260Sstevel@tonic-gate 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
21270Sstevel@tonic-gate 	pte = hat_mempte_setup(addr);
21280Sstevel@tonic-gate 	cpup->cpu_caddr2 = addr;
21290Sstevel@tonic-gate 	cpup->cpu_caddr2pte = (pteptr_t)pte;
21300Sstevel@tonic-gate 
21310Sstevel@tonic-gate 	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
21320Sstevel@tonic-gate }
21330Sstevel@tonic-gate 
21340Sstevel@tonic-gate 
21350Sstevel@tonic-gate /*
21360Sstevel@tonic-gate  * Create the pageout scanner thread. The thread runs 'procedure'
21370Sstevel@tonic-gate  * in process pp at priority pri.
21380Sstevel@tonic-gate  */
21390Sstevel@tonic-gate void
21400Sstevel@tonic-gate pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
21410Sstevel@tonic-gate {
21420Sstevel@tonic-gate 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
21430Sstevel@tonic-gate }
21440Sstevel@tonic-gate 
21450Sstevel@tonic-gate /*
21460Sstevel@tonic-gate  * Function for flushing D-cache when performing module relocations
21470Sstevel@tonic-gate  * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
21480Sstevel@tonic-gate  */
21490Sstevel@tonic-gate void
21500Sstevel@tonic-gate dcache_flushall()
21510Sstevel@tonic-gate {}
21523177Sdp78419 
21533177Sdp78419 size_t
21543177Sdp78419 exec_get_spslew(void)
21553177Sdp78419 {
21563177Sdp78419 	return (0);
21573177Sdp78419 }
2158