xref: /onnv-gate/usr/src/uts/i86pc/vm/vm_machdep.c (revision 1373:21e3b2045b46)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*	All Rights Reserved   */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 0;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern uintptr_t eprom_kernelbase;
extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */

/* 4g memory management */
pgcnt_t		maxmem4g;
pgcnt_t		freemem4g;
int		physmax4g;
int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int		lotsfree4gshift = 3;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	level_t l;

	if (remap)
		*remap = 0;

	switch (maptype) {

	case MAPPGSZ_STK:
	case MAPPGSZ_HEAP:
	case MAPPGSZ_VA:
		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (len < LEVEL_SIZE(l))
				continue;
			break;
		}
		return (LEVEL_SIZE(l));

	/*
	 * for ISM use the 1st large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (0);
}
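
/*
 * Worked example (a sketch, assuming 4K base pages and a 2M level-1
 * page size, i.e. LEVEL_SIZE(0) == 0x1000 and LEVEL_SIZE(1) == 0x200000):
 * a MAPPGSZ_HEAP request with len == 0x300000 satisfies
 * len >= LEVEL_SIZE(1), so the loop stops at l == 1 and 2M is returned;
 * a request with len == 0x100000 falls through to l == 0 and gets the
 * 4K base page size.
 */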

/*
 * This can be patched via /etc/system to allow large pages
 * to be used for mapping application and library text segments.
 */
int	use_text_largepages = 0;

/*
 * Return a bit vector of large page size codes that
 * can be used to map the [addr, addr + len) region.
 */

/*ARGSUSED*/
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	size_t	pgsz;
	caddr_t a;

	if (!text || !use_text_largepages ||
	    mmu.max_page_level == 0)
		return (0);

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}
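
/*
 * Illustration (a sketch, again assuming a 2M LEVEL_SIZE(1)): with
 * use_text_largepages set, a text mapping at addr == 0x10000 with
 * len == 0x400000 rounds up to a == 0x200000; the remaining 0x210000
 * bytes still cover a full 2M page, so the size-code bit vector
 * (1 << 1) is returned.  Had less than 2M remained past the rounded
 * address, the function would return 0 instead.
 */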

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {		/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X/4.X/5.X compatibility.
	 * This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page was already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}
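
/*
 * A worked example of the ILP32 gap expansion above (hypothetical
 * addresses, 4K pages): with base == 0x8061234 and len == 0x800,
 *	ea   = P2ROUNDUP(0x8061a34, 0x1000) == 0x8062000
 *	base = P2ALIGN(0x8061234, 0x1000)   == 0x8061000
 *	len  = 0x1000
 * so the zfod mapping attempt covers whole pages on both sides of the
 * original [base, base + len) range.
 */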

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below kernelbase.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 *	align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately, making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}
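
/*
 * A worked example of the rounding above (hypothetical values): with
 * addr == 0x2fff000, align_amount == 0x400000 and
 * (off & (align_amount - 1)) == 0x1000:
 *	round down:	0x2fff000 & ~0x3fffff == 0x2c00000
 *	add offset:	0x2c00000 + 0x1000    == 0x2c01000
 * 0x2c01000 < 0x2fff000, so align_amount is added back, giving
 * 0x3001000, which is within align_amount of the original address and
 * congruent to the file offset modulo align_amount.
 */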

/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}
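
/*
 * Illustration of the amd64 hole clipping (a sketch; hole_start and
 * hole_end bracket the non-canonical address range, roughly
 * 0x0000800000000000 through 0xffff800000000000 on current hardware):
 * for a request that straddles the entire hole, AH_LO keeps
 * [lo, hole_start) when that piece is at least minlen long, while the
 * default prefers [hole_end, hi); if neither surviving piece is large
 * enough, the request fails with 0.
 */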

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
}


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
    0x100000,	/* pfn range for 4G and above */
    0x80000,	/* pfn range for 2G-4G */
    0x01000,	/* pfn range for 16M-2G */
    0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
#if defined(__i386)
int restricted_kmemalloc = 1;	/* XX64 re-examine with PSARC 2004/405 */
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];


/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}
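
/*
 * Example: with the default arch_memranges above, memrange_num(0x40000)
 * (the pfn at the 1G boundary) fails the 0x100000 and 0x80000 tests,
 * matches pfn >= 0x01000, and so returns 2, the 16M-2G range.
 */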

/*
 * return the mnoderange containing pfn
 */
int
pfn_2_mtype(pfn_t pfn)
{
	int	n;

	for (n = mnoderangecnt - 1; n >= 0; n--) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
}

/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}
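
/*
 * The segment-boundary test above relies on the low bits wrapping:
 * e.g. (a sketch) with pfnseg == 0x0f and *pfnp == 0x1c, a minctg of 6
 * gives (0x1c + 5) & 0x0f == 0x01, which is less than
 * 0x1c & 0x0f == 0x0c, so the run would cross a segment boundary and
 * the next search restarts at roundup(0x1c, 0x10) == 0x20.
 */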

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (mmu_ptob((uint64_t)pp->p_pagenum) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
		pp = pp->p_next;
	}
}
#endif

static kmutex_t	contig_lock;

#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where we last searched if minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}
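
/*
 * The search above is a simple rotor: it first scans from startpfn
 * (where the previous allocation left off) up to hi, and only if that
 * fails does it wrap around and scan from lo up to startpfn.  Keeping
 * startpfn across calls avoids rescanning the low, usually more
 * fragmented, part of memory on every allocation.
 */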

/*
 * combine mem_node_config and memrange memory ranges into one data
 * structure to be used for page list management.
 *
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 *
 * mnode_range_setup() initializes mnoderanges.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

int
mnode_range_cnt()
{
	int	mri;
	int	mnrcnt = 0;
	int	mnode;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		/* find the memranges index containing the mnode's physbase */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	return (mnrcnt);
}

void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo =
			    MAX(MEMRANGELO(mri),
				mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi =
			    MIN(MEMRANGEHI(mri),
				mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}
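
/*
 * Worked example (a sketch, assuming a single mnode covering pfns 0
 * through 0x13ffff, i.e. 5G, and the default arch_memranges):
 * mnode_range_cnt() returns 4 and mnode_range_setup() produces
 *	{ mnr_pfnlo = 0x000000, mnr_pfnhi = 0x000fff }	0-16M
 *	{ mnr_pfnlo = 0x001000, mnr_pfnhi = 0x07ffff }	16M-2G
 *	{ mnr_pfnlo = 0x080000, mnr_pfnhi = 0x0fffff }	2G-4G
 *	{ mnr_pfnlo = 0x100000, mnr_pfnhi = 0x13ffff }	4G+
 */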

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode.  If flags & PGI_MT_RANGE is set, then mtype is the
 * highest index in a range of indices to walk down to 0 (PGI_MT_RANGE0) or
 * down to the first range above 4g (PGI_MT_RANGE4G).
 *
 * Return first mnode range type index found otherwise return -1 if none found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int	mtlim = 0;	/* default to PGI_MT_RANGE0 */

		if (flags & PGI_MT_NEXT)
			mtype--;
		if (flags & PGI_MT_RANGE4G)
			mtlim = mtype4g + 1;
		while (mtype >= mtlim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype--;
		}
	} else {
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
	}
	return (-1);
}
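
/*
 * Example (a sketch, using the 4-entry layout sketched above with all
 * ranges owned by mnode 0): mtype_func(0, 3, PGI_MT_RANGE0) returns 3,
 * and repeating the call with PGI_MT_NEXT set steps the index down
 * through 2, 1 and 0 before ending with -1.  With PGI_MT_RANGE4G the
 * walk stops above mtype4g, so only ranges above 4g are visited.
 */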

/*
 * Update the page list max counts with the pfn range specified by the
 * input parameters.  Called from add_physmem() when physical memory with
 * page_t's are initially added to the page lists.
 */
void
mtype_modify_max(pfn_t startpfn, long cnt)
{
	int	mtype = 0;
	pfn_t	endpfn = startpfn + cnt, pfn;
	pgcnt_t	inc;

	ASSERT(cnt > 0);

	for (pfn = startpfn; pfn < endpfn; ) {
		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
				inc = endpfn - pfn;
			} else {
				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
			}
			mnoderanges[mtype].mnr_mt_pgmax += inc;
			if (physmax4g && mtype <= mtype4g)
				maxmem4g += inc;
			pfn += inc;
		}
		mtype++;
		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
	}
}
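
/*
 * Example (a sketch, assuming physmax4g is set and the layout sketched
 * above): mtype_modify_max(0x800, 0x1000) credits 0x800 pages to the
 * 0-16M range (up to its mnr_pfnhi of 0xfff) and the remaining 0x800
 * pages to the 16M-2G range; both ranges sit below 4g, so maxmem4g
 * grows by the full 0x1000 pages.
 */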

/*
 * Returns the free page count for mnode
 */
int
mnode_pgcnt(int mnode)
{
	int	mtype = mnoderangecnt - 1;
	int	flags = PGI_MT_RANGE0;
	pgcnt_t	pgcnt = 0;

	mtype = mtype_func(mnode, mtype, flags);

	while (mtype != -1) {
		pgcnt += (mnoderanges[mtype].mnr_mt_flpgcnt +
		    mnoderanges[mtype].mnr_mt_lgpgcnt +
		    mnoderanges[mtype].mnr_mt_clpgcnt);
		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
	}
	return (pgcnt);
}

/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
	memranges += i;
	nranges -= i;
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;

	/*
	 * setup pagesize for generic page layer
	 */
	for (i = 0; i <= mmu.max_page_level; ++i) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
	}

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* size for mnoderanges */
	mnoderangecnt = mnode_range_cnt();
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}
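
/*
 * Sizing sketch (hypothetical values: mnoderangecnt == 4,
 * max_mem_nodes == 1, mmu_page_sizes == 2, page_colors == 16, with
 * page_get_pagecolors() returning 16 and 1 for the two sizes):
 *	colorsz = 4 * sizeof (mnoderange_t)
 *	    + 2 * 1 * sizeof (kmutex_t) * NPC_MUTEX
 *	    + 4 * (sizeof (page_t ***) + 2 * sizeof (page_t **))
 *	    + 4 * (16 + 1) * sizeof (page_t *)
 *	    + 4 * (sizeof (page_t **) + 16 * sizeof (page_t *))
 * which page_coloring_setup() below carves out of one contiguous
 * allocation in exactly this order.
 */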

/*
 * Called once at startup to configure page_coloring data structures and
 * does the 1st page_free()/page_freelist_add().
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}
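
/*
 * Carving order of the single pcmemaddr allocation, matching the sizing
 * above: mnoderanges, then the NPC_MUTEX fpc_mutex and cpc_mutex arrays
 * (max_mem_nodes entries each), then the page_freelists and
 * page_cachelists pointer arrays, and finally, per mnode range, the
 * per-pagesize freelist color bins followed by the cachelist color
 * bins.
 */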
12150Sstevel@tonic-gate 
12160Sstevel@tonic-gate /*ARGSUSED*/
12170Sstevel@tonic-gate int
12180Sstevel@tonic-gate bp_color(struct buf *bp)
12190Sstevel@tonic-gate {
12200Sstevel@tonic-gate 	return (0);
12210Sstevel@tonic-gate }
12220Sstevel@tonic-gate 
12230Sstevel@tonic-gate /*
12240Sstevel@tonic-gate  * get a page from any list with the given mnode
12250Sstevel@tonic-gate  */
12260Sstevel@tonic-gate page_t *
12270Sstevel@tonic-gate page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
12280Sstevel@tonic-gate     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
12290Sstevel@tonic-gate {
12300Sstevel@tonic-gate 	kmutex_t	*pcm;
12310Sstevel@tonic-gate 	int		i;
12320Sstevel@tonic-gate 	page_t		*pp;
12330Sstevel@tonic-gate 	page_t		*first_pp;
12340Sstevel@tonic-gate 	uint64_t	pgaddr;
12350Sstevel@tonic-gate 	ulong_t		bin;
12360Sstevel@tonic-gate 	int		mtypestart;
12370Sstevel@tonic-gate 
12380Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pgma_alloc);
12390Sstevel@tonic-gate 
12400Sstevel@tonic-gate 	ASSERT((flags & PG_MATCH_COLOR) == 0);
12410Sstevel@tonic-gate 	ASSERT(szc == 0);
12420Sstevel@tonic-gate 	ASSERT(dma_attr != NULL);
12430Sstevel@tonic-gate 
12440Sstevel@tonic-gate 
12450Sstevel@tonic-gate 	MTYPE_START(mnode, mtype, flags);
12460Sstevel@tonic-gate 	if (mtype < 0) {
12470Sstevel@tonic-gate 		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
12480Sstevel@tonic-gate 		return (NULL);
12490Sstevel@tonic-gate 	}
12500Sstevel@tonic-gate 
12510Sstevel@tonic-gate 	mtypestart = mtype;
12520Sstevel@tonic-gate 
12530Sstevel@tonic-gate 	bin = origbin;
12540Sstevel@tonic-gate 
12550Sstevel@tonic-gate 	/*
12560Sstevel@tonic-gate 	 * check up to page_colors + 1 bins - origbin may be checked twice
12570Sstevel@tonic-gate 	 * because of BIN_STEP skip
12580Sstevel@tonic-gate 	 */
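	/*
	 * Illustrative walk (values hypothetical): with page_colors == 16
	 * and BIN_STEP == 20, a search from origbin 5 checks bin 5, hops to
	 * (5 + 20) & 15 == 9, then advances one bin at a time through the
	 * mask; only that first BIN_STEP hop can cause origbin to be seen
	 * a second time.
	 */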
12590Sstevel@tonic-gate 	do {
12600Sstevel@tonic-gate 		i = 0;
12610Sstevel@tonic-gate 		while (i <= page_colors) {
12620Sstevel@tonic-gate 			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
12630Sstevel@tonic-gate 				goto nextfreebin;
12640Sstevel@tonic-gate 
12650Sstevel@tonic-gate 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
12660Sstevel@tonic-gate 			mutex_enter(pcm);
12670Sstevel@tonic-gate 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
12680Sstevel@tonic-gate 			first_pp = pp;
12690Sstevel@tonic-gate 			while (pp != NULL) {
12700Sstevel@tonic-gate 				if (page_trylock(pp, SE_EXCL) == 0) {
12710Sstevel@tonic-gate 					pp = pp->p_next;
12720Sstevel@tonic-gate 					if (pp == first_pp) {
12730Sstevel@tonic-gate 						pp = NULL;
12740Sstevel@tonic-gate 					}
12750Sstevel@tonic-gate 					continue;
12760Sstevel@tonic-gate 				}
12770Sstevel@tonic-gate 
12780Sstevel@tonic-gate 				ASSERT(PP_ISFREE(pp));
12790Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp));
12800Sstevel@tonic-gate 				ASSERT(pp->p_vnode == NULL);
12810Sstevel@tonic-gate 				ASSERT(pp->p_hash == NULL);
12820Sstevel@tonic-gate 				ASSERT(pp->p_offset == (u_offset_t)-1);
12830Sstevel@tonic-gate 				ASSERT(pp->p_szc == szc);
12840Sstevel@tonic-gate 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
12850Sstevel@tonic-gate 				/* check if page within DMA attributes */
12860Sstevel@tonic-gate 				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));
12870Sstevel@tonic-gate 
12880Sstevel@tonic-gate 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
12890Sstevel@tonic-gate 				    (pgaddr + MMU_PAGESIZE - 1 <=
12900Sstevel@tonic-gate 				    dma_attr->dma_attr_addr_hi)) {
12910Sstevel@tonic-gate 					break;
12920Sstevel@tonic-gate 				}
12930Sstevel@tonic-gate 
12940Sstevel@tonic-gate 				/* continue looking */
12950Sstevel@tonic-gate 				page_unlock(pp);
12960Sstevel@tonic-gate 				pp = pp->p_next;
12970Sstevel@tonic-gate 				if (pp == first_pp)
12980Sstevel@tonic-gate 					pp = NULL;
12990Sstevel@tonic-gate 
13000Sstevel@tonic-gate 			}
13010Sstevel@tonic-gate 			if (pp != NULL) {
13020Sstevel@tonic-gate 				ASSERT(mtype == PP_2_MTYPE(pp));
13030Sstevel@tonic-gate 				ASSERT(pp->p_szc == 0);
13040Sstevel@tonic-gate 
13050Sstevel@tonic-gate 				/* found a page with specified DMA attributes */
13060Sstevel@tonic-gate 				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
13070Sstevel@tonic-gate 				    mtype), pp);
1308414Skchow 				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
13090Sstevel@tonic-gate 
13100Sstevel@tonic-gate 				if ((PP_ISFREE(pp) == 0) ||
13110Sstevel@tonic-gate 				    (PP_ISAGED(pp) == 0)) {
13120Sstevel@tonic-gate 					cmn_err(CE_PANIC, "page %p is not free",
13130Sstevel@tonic-gate 					    (void *)pp);
13140Sstevel@tonic-gate 				}
13150Sstevel@tonic-gate 
13160Sstevel@tonic-gate 				mutex_exit(pcm);
13170Sstevel@tonic-gate 				check_dma(dma_attr, pp, 1);
13180Sstevel@tonic-gate 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
13190Sstevel@tonic-gate 				return (pp);
13200Sstevel@tonic-gate 			}
13210Sstevel@tonic-gate 			mutex_exit(pcm);
13220Sstevel@tonic-gate nextfreebin:
13230Sstevel@tonic-gate 			pp = page_freelist_fill(szc, bin, mnode, mtype,
13240Sstevel@tonic-gate 			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
13250Sstevel@tonic-gate 			if (pp)
13260Sstevel@tonic-gate 				return (pp);
13270Sstevel@tonic-gate 
13280Sstevel@tonic-gate 			/* try next bin */
13290Sstevel@tonic-gate 			bin += (i == 0) ? BIN_STEP : 1;
13300Sstevel@tonic-gate 			bin &= page_colors_mask;
13310Sstevel@tonic-gate 			i++;
13320Sstevel@tonic-gate 		}
1333414Skchow 		MTYPE_NEXT(mnode, mtype, flags);
1334414Skchow 	} while (mtype >= 0);
13350Sstevel@tonic-gate 
13360Sstevel@tonic-gate 	/* failed to find a page in the freelist; try it in the cachelist */
13370Sstevel@tonic-gate 
13380Sstevel@tonic-gate 	/* reset mtype start for cachelist search */
13390Sstevel@tonic-gate 	mtype = mtypestart;
13400Sstevel@tonic-gate 	ASSERT(mtype >= 0);
13410Sstevel@tonic-gate 
13420Sstevel@tonic-gate 	/* start with the bin of matching color */
13430Sstevel@tonic-gate 	bin = origbin;
13440Sstevel@tonic-gate 
13450Sstevel@tonic-gate 	do {
13460Sstevel@tonic-gate 		for (i = 0; i <= page_colors; i++) {
13470Sstevel@tonic-gate 			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
13480Sstevel@tonic-gate 				goto nextcachebin;
13490Sstevel@tonic-gate 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
13500Sstevel@tonic-gate 			mutex_enter(pcm);
13510Sstevel@tonic-gate 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
13520Sstevel@tonic-gate 			first_pp = pp;
13530Sstevel@tonic-gate 			while (pp != NULL) {
13540Sstevel@tonic-gate 				if (page_trylock(pp, SE_EXCL) == 0) {
13550Sstevel@tonic-gate 					pp = pp->p_next;
13560Sstevel@tonic-gate 					if (pp == first_pp)
13570Sstevel@tonic-gate 						pp = NULL;
13580Sstevel@tonic-gate 					continue;
13590Sstevel@tonic-gate 				}
13600Sstevel@tonic-gate 				ASSERT(pp->p_vnode);
13610Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp) == 0);
13620Sstevel@tonic-gate 				ASSERT(pp->p_szc == 0);
13630Sstevel@tonic-gate 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
13640Sstevel@tonic-gate 
13650Sstevel@tonic-gate 				/* check if page within DMA attributes */
13660Sstevel@tonic-gate 
13670Sstevel@tonic-gate 				pgaddr = ptob((uint64_t)(pp->p_pagenum));
13680Sstevel@tonic-gate 
13690Sstevel@tonic-gate 				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
13700Sstevel@tonic-gate 				    (pgaddr + MMU_PAGESIZE - 1 <=
13710Sstevel@tonic-gate 				    dma_attr->dma_attr_addr_hi)) {
13720Sstevel@tonic-gate 					break;
13730Sstevel@tonic-gate 				}
13740Sstevel@tonic-gate 
13750Sstevel@tonic-gate 				/* continue looking */
13760Sstevel@tonic-gate 				page_unlock(pp);
13770Sstevel@tonic-gate 				pp = pp->p_next;
13780Sstevel@tonic-gate 				if (pp == first_pp)
13790Sstevel@tonic-gate 					pp = NULL;
13800Sstevel@tonic-gate 			}
13810Sstevel@tonic-gate 
13820Sstevel@tonic-gate 			if (pp != NULL) {
13830Sstevel@tonic-gate 				ASSERT(mtype == PP_2_MTYPE(pp));
13840Sstevel@tonic-gate 				ASSERT(pp->p_szc == 0);
13850Sstevel@tonic-gate 
13860Sstevel@tonic-gate 				/* found a page with specified DMA attributes */
13870Sstevel@tonic-gate 				page_sub(&PAGE_CACHELISTS(mnode, bin,
13880Sstevel@tonic-gate 				    mtype), pp);
1389414Skchow 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
13900Sstevel@tonic-gate 
13910Sstevel@tonic-gate 				mutex_exit(pcm);
13920Sstevel@tonic-gate 				ASSERT(pp->p_vnode);
13930Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp) == 0);
13940Sstevel@tonic-gate 				check_dma(dma_attr, pp, 1);
13950Sstevel@tonic-gate 				VM_STAT_ADD(pga_vmstats.pgma_allocok);
13960Sstevel@tonic-gate 				return (pp);
13970Sstevel@tonic-gate 			}
13980Sstevel@tonic-gate 			mutex_exit(pcm);
13990Sstevel@tonic-gate nextcachebin:
14000Sstevel@tonic-gate 			bin += (i == 0) ? BIN_STEP : 1;
14010Sstevel@tonic-gate 			bin &= page_colors_mask;
14020Sstevel@tonic-gate 		}
1403414Skchow 		MTYPE_NEXT(mnode, mtype, flags);
1404414Skchow 	} while (mtype >= 0);
14050Sstevel@tonic-gate 
14060Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
14070Sstevel@tonic-gate 	return (NULL);
14080Sstevel@tonic-gate }
14090Sstevel@tonic-gate 
14100Sstevel@tonic-gate /*
14110Sstevel@tonic-gate  * This function is similar to page_get_freelist()/page_get_cachelist(),
14120Sstevel@tonic-gate  * but it searches both lists for a page with the specified color (or
14130Sstevel@tonic-gate  * no color) and DMA attributes. The freelist is searched first, then
14140Sstevel@tonic-gate  * the cachelist, within the highest memory range (based on the DMA
14150Sstevel@tonic-gate  * attributes) before moving on to search the lower memory
14160Sstevel@tonic-gate  * ranges.
14170Sstevel@tonic-gate  *
14180Sstevel@tonic-gate  * Note: This function is called only by page_create_io().
14190Sstevel@tonic-gate  */
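/*
 * A minimal call sketch (hypothetical driver values, for illustration
 * only):
 *
 *	ddi_dma_attr_t attr;	(filled in by the driver)
 *	page_t *pp;
 *
 *	attr.dma_attr_addr_lo = 0;
 *	attr.dma_attr_addr_hi = 0xffffff;	(16MB ISA-style limit)
 *	attr.dma_attr_align = MMU_PAGESIZE;	(anything larger fails)
 *	pp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, PG_EXCL,
 *	    &attr, NULL);	(a nonexistent lgrp falls back to home)
 */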
14200Sstevel@tonic-gate /*ARGSUSED*/
14210Sstevel@tonic-gate page_t *
14220Sstevel@tonic-gate page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
14230Sstevel@tonic-gate     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
14240Sstevel@tonic-gate {
14250Sstevel@tonic-gate 	uint_t		bin;
14260Sstevel@tonic-gate 	int		mtype;
14270Sstevel@tonic-gate 	page_t		*pp;
14280Sstevel@tonic-gate 	int		n;
14290Sstevel@tonic-gate 	int		m;
14300Sstevel@tonic-gate 	int		szc;
14310Sstevel@tonic-gate 	int		fullrange;
14320Sstevel@tonic-gate 	int		mnode;
14330Sstevel@tonic-gate 	int		local_failed_stat = 0;
14340Sstevel@tonic-gate 	lgrp_mnode_cookie_t	lgrp_cookie;
14350Sstevel@tonic-gate 
14360Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pga_alloc);
14370Sstevel@tonic-gate 
14380Sstevel@tonic-gate 	/* only base pagesize currently supported */
14390Sstevel@tonic-gate 	if (size != MMU_PAGESIZE)
14400Sstevel@tonic-gate 		return (NULL);
14410Sstevel@tonic-gate 
14420Sstevel@tonic-gate 	/*
14430Sstevel@tonic-gate 	 * If we're passed a specific lgroup, we use it.  Otherwise,
14440Sstevel@tonic-gate 	 * assume first-touch placement is desired.
14450Sstevel@tonic-gate 	 */
14460Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp))
14470Sstevel@tonic-gate 		lgrp = lgrp_home_lgrp();
14480Sstevel@tonic-gate 
14490Sstevel@tonic-gate 	/* LINTED */
14500Sstevel@tonic-gate 	AS_2_BIN(as, seg, vp, vaddr, bin);
14510Sstevel@tonic-gate 
14520Sstevel@tonic-gate 	/*
14530Sstevel@tonic-gate 	 * Only hold one freelist or cachelist lock at a time, so that we
14540Sstevel@tonic-gate 	 * can start anywhere and don't have to worry about lock
14550Sstevel@tonic-gate 	 * ordering.
14560Sstevel@tonic-gate 	 */
14570Sstevel@tonic-gate 	if (dma_attr == NULL) {
14580Sstevel@tonic-gate 		n = 0;
14590Sstevel@tonic-gate 		m = mnoderangecnt - 1;
14600Sstevel@tonic-gate 		fullrange = 1;
14610Sstevel@tonic-gate 		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
14620Sstevel@tonic-gate 	} else {
14630Sstevel@tonic-gate 		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
14640Sstevel@tonic-gate 		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
14650Sstevel@tonic-gate 
14660Sstevel@tonic-gate 		/*
14670Sstevel@tonic-gate 		 * We can only guarantee alignment to a page boundary.
14680Sstevel@tonic-gate 		 */
14690Sstevel@tonic-gate 		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
14700Sstevel@tonic-gate 			return (NULL);
14710Sstevel@tonic-gate 
14720Sstevel@tonic-gate 		n = pfn_2_mtype(pfnlo);
14730Sstevel@tonic-gate 		m = pfn_2_mtype(pfnhi);
14740Sstevel@tonic-gate 
14750Sstevel@tonic-gate 		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
14760Sstevel@tonic-gate 		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
14770Sstevel@tonic-gate 	}
14780Sstevel@tonic-gate 	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
14790Sstevel@tonic-gate 
14800Sstevel@tonic-gate 	if (n > m)
14810Sstevel@tonic-gate 		return (NULL);
14820Sstevel@tonic-gate 
14830Sstevel@tonic-gate 	szc = 0;
14840Sstevel@tonic-gate 
14850Sstevel@tonic-gate 	/* cycling through mtypes is handled by RANGE0 if n == 0 */
14860Sstevel@tonic-gate 	if (n == 0) {
14870Sstevel@tonic-gate 		flags |= PGI_MT_RANGE0;
14880Sstevel@tonic-gate 		n = m;
14890Sstevel@tonic-gate 	}
14900Sstevel@tonic-gate 
14910Sstevel@tonic-gate 	/*
14920Sstevel@tonic-gate 	 * Try local memory node first, but try remote if we can't
14930Sstevel@tonic-gate 	 * get a page of the right color.
14940Sstevel@tonic-gate 	 */
14950Sstevel@tonic-gate 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
14960Sstevel@tonic-gate 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
14970Sstevel@tonic-gate 		/*
14980Sstevel@tonic-gate 		 * allocate pages from high pfn to low.
14990Sstevel@tonic-gate 		 */
15000Sstevel@tonic-gate 		for (mtype = m; mtype >= n; mtype--) {
15010Sstevel@tonic-gate 			if (fullrange != 0) {
15020Sstevel@tonic-gate 				pp = page_get_mnode_freelist(mnode,
15030Sstevel@tonic-gate 				    bin, mtype, szc, flags);
15040Sstevel@tonic-gate 				if (pp == NULL) {
15050Sstevel@tonic-gate 					pp = page_get_mnode_cachelist(
15060Sstevel@tonic-gate 						bin, flags, mnode, mtype);
15070Sstevel@tonic-gate 				}
15080Sstevel@tonic-gate 			} else {
15090Sstevel@tonic-gate 				pp = page_get_mnode_anylist(bin, szc,
15100Sstevel@tonic-gate 				    flags, mnode, mtype, dma_attr);
15110Sstevel@tonic-gate 			}
15120Sstevel@tonic-gate 			if (pp != NULL) {
15130Sstevel@tonic-gate 				VM_STAT_ADD(pga_vmstats.pga_allocok);
15140Sstevel@tonic-gate 				check_dma(dma_attr, pp, 1);
15150Sstevel@tonic-gate 				return (pp);
15160Sstevel@tonic-gate 			}
15170Sstevel@tonic-gate 		}
15180Sstevel@tonic-gate 		if (!local_failed_stat) {
15190Sstevel@tonic-gate 			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
15200Sstevel@tonic-gate 			local_failed_stat = 1;
15210Sstevel@tonic-gate 		}
15220Sstevel@tonic-gate 	}
15230Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pga_allocfailed);
15240Sstevel@tonic-gate 
15250Sstevel@tonic-gate 	return (NULL);
15260Sstevel@tonic-gate }
15270Sstevel@tonic-gate 
15280Sstevel@tonic-gate /*
15290Sstevel@tonic-gate  * page_create_io()
15300Sstevel@tonic-gate  *
15310Sstevel@tonic-gate  * This function is a copy of page_create_va() with an additional
15320Sstevel@tonic-gate  * argument 'mattr' that passes DMA memory requirements through to
15330Sstevel@tonic-gate  * the page list functions. It is used by the segkmem allocator,
15340Sstevel@tonic-gate  * so it is only used to create new pages (i.e., PG_EXCL is
15350Sstevel@tonic-gate  * set).
15360Sstevel@tonic-gate  *
15370Sstevel@tonic-gate  * Note: This interface is currently used only by the x86 PSM and is
15380Sstevel@tonic-gate  *	 not fully specified, so the commitment level is that of a
15390Sstevel@tonic-gate  *	 private interface specific to x86. It uses the PSM-specific
15400Sstevel@tonic-gate  *	 page_get_anylist() interface.
15410Sstevel@tonic-gate  */
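/*
 * A sketch of the expected usage (hypothetical values, for illustration
 * only):
 *
 *	ddi_dma_attr_t mattr;	(addr_lo, addr_hi, sgllen set by caller)
 *	page_t *plist;
 *
 *	plist = page_create_io(&kvp, off, ptob(2),
 *	    PG_EXCL | PG_PHYSCONTIG, &kas, vaddr, &mattr);
 *
 * With PG_PHYSCONTIG the request goes to page_get_contigpage() first;
 * any remainder is then gathered a page at a time via page_get_anylist().
 */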
15420Sstevel@tonic-gate 
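/*
 * Walk the page hash chain at `index' looking for the page identified by
 * <vp, off>; leaves `pp' pointing at the match, or NULL if there is none.
 * Used below while holding the corresponding PAGE_HASH_MUTEX.
 */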
15430Sstevel@tonic-gate #define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
15440Sstevel@tonic-gate 	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
15450Sstevel@tonic-gate 		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
15460Sstevel@tonic-gate 			break; \
15470Sstevel@tonic-gate 	} \
15480Sstevel@tonic-gate }
15490Sstevel@tonic-gate 
15500Sstevel@tonic-gate 
15510Sstevel@tonic-gate page_t *
15520Sstevel@tonic-gate page_create_io(
15530Sstevel@tonic-gate 	struct vnode	*vp,
15540Sstevel@tonic-gate 	u_offset_t	off,
15550Sstevel@tonic-gate 	uint_t		bytes,
15560Sstevel@tonic-gate 	uint_t		flags,
15570Sstevel@tonic-gate 	struct as	*as,
15580Sstevel@tonic-gate 	caddr_t		vaddr,
15590Sstevel@tonic-gate 	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
15600Sstevel@tonic-gate {
15610Sstevel@tonic-gate 	page_t		*plist = NULL;
15620Sstevel@tonic-gate 	uint_t		plist_len = 0;
15630Sstevel@tonic-gate 	pgcnt_t		npages;
15640Sstevel@tonic-gate 	page_t		*npp = NULL;
15650Sstevel@tonic-gate 	uint_t		pages_req;
15660Sstevel@tonic-gate 	page_t		*pp;
15670Sstevel@tonic-gate 	kmutex_t	*phm = NULL;
15680Sstevel@tonic-gate 	uint_t		index;
15690Sstevel@tonic-gate 
15700Sstevel@tonic-gate 	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
15710Sstevel@tonic-gate 		"page_create_start:vp %p off %llx bytes %u flags %x",
15720Sstevel@tonic-gate 		vp, off, bytes, flags);
15730Sstevel@tonic-gate 
15740Sstevel@tonic-gate 	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
15750Sstevel@tonic-gate 
15760Sstevel@tonic-gate 	pages_req = npages = mmu_btopr(bytes);
15770Sstevel@tonic-gate 
15780Sstevel@tonic-gate 	/*
15790Sstevel@tonic-gate 	 * Do the freemem and pcf accounting.
15800Sstevel@tonic-gate 	 */
15810Sstevel@tonic-gate 	if (!page_create_wait(npages, flags)) {
15820Sstevel@tonic-gate 		return (NULL);
15830Sstevel@tonic-gate 	}
15840Sstevel@tonic-gate 
15850Sstevel@tonic-gate 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
15860Sstevel@tonic-gate 		"page_create_success:vp %p off %llx",
15870Sstevel@tonic-gate 		vp, off);
15880Sstevel@tonic-gate 
15890Sstevel@tonic-gate 	/*
15900Sstevel@tonic-gate 	 * If satisfying this request has left us with too little
15910Sstevel@tonic-gate 	 * memory, start the wheels turning to get some back.  The
15920Sstevel@tonic-gate 	 * first clause of the test prevents waking up the pageout
15930Sstevel@tonic-gate 	 * daemon in situations where it would decide that there's
15940Sstevel@tonic-gate 	 * nothing to do.
15950Sstevel@tonic-gate 	 */
15960Sstevel@tonic-gate 	if (nscan < desscan && freemem < minfree) {
15970Sstevel@tonic-gate 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
15980Sstevel@tonic-gate 			"pageout_cv_signal:freemem %ld", freemem);
15990Sstevel@tonic-gate 		cv_signal(&proc_pageout->p_cv);
16000Sstevel@tonic-gate 	}
16010Sstevel@tonic-gate 
16020Sstevel@tonic-gate 	if (flags & PG_PHYSCONTIG) {
16030Sstevel@tonic-gate 
16040Sstevel@tonic-gate 		plist = page_get_contigpage(&npages, mattr, 1);
16050Sstevel@tonic-gate 		if (plist == NULL) {
16060Sstevel@tonic-gate 			page_create_putback(npages);
16070Sstevel@tonic-gate 			return (NULL);
16080Sstevel@tonic-gate 		}
16090Sstevel@tonic-gate 
16100Sstevel@tonic-gate 		pp = plist;
16110Sstevel@tonic-gate 
16120Sstevel@tonic-gate 		do {
16130Sstevel@tonic-gate 			if (!page_hashin(pp, vp, off, NULL)) {
16140Sstevel@tonic-gate 				panic("page_create_io: hashin failed %p %p %llx",
16150Sstevel@tonic-gate 				    (void *)pp, (void *)vp, off);
16160Sstevel@tonic-gate 			}
16170Sstevel@tonic-gate 			VM_STAT_ADD(page_create_new);
16180Sstevel@tonic-gate 			off += MMU_PAGESIZE;
16190Sstevel@tonic-gate 			PP_CLRFREE(pp);
16200Sstevel@tonic-gate 			PP_CLRAGED(pp);
16210Sstevel@tonic-gate 			page_set_props(pp, P_REF);
16220Sstevel@tonic-gate 			pp = pp->p_next;
16230Sstevel@tonic-gate 		} while (pp != plist);
16240Sstevel@tonic-gate 
16250Sstevel@tonic-gate 		if (!npages) {
16260Sstevel@tonic-gate 			check_dma(mattr, plist, pages_req);
16270Sstevel@tonic-gate 			return (plist);
16280Sstevel@tonic-gate 		} else {
16290Sstevel@tonic-gate 			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
16300Sstevel@tonic-gate 		}
16310Sstevel@tonic-gate 
16320Sstevel@tonic-gate 		/*
16330Sstevel@tonic-gate 		 * fall-thru:
16340Sstevel@tonic-gate 		 *
16350Sstevel@tonic-gate 		 * page_get_contigpage returns when npages <= sgllen.
16360Sstevel@tonic-gate 		 * Grab the rest of the non-contig pages below from anylist.
16370Sstevel@tonic-gate 		 */
16380Sstevel@tonic-gate 	}
16390Sstevel@tonic-gate 
16400Sstevel@tonic-gate 	/*
16410Sstevel@tonic-gate 	 * Loop around collecting the requested number of pages.
16420Sstevel@tonic-gate 	 * Most of the time, we have to `create' a new page. With
16430Sstevel@tonic-gate 	 * this in mind, pull the page off the free list before
16440Sstevel@tonic-gate 	 * getting the hash lock.  This will minimize the hash
16450Sstevel@tonic-gate 	 * lock hold time, nesting, and the like.  If it turns
16460Sstevel@tonic-gate 	 * out we don't need the page, we put it back at the end.
16470Sstevel@tonic-gate 	 */
16480Sstevel@tonic-gate 	while (npages--) {
16490Sstevel@tonic-gate 		phm = NULL;
16500Sstevel@tonic-gate 
16510Sstevel@tonic-gate 		index = PAGE_HASH_FUNC(vp, off);
16520Sstevel@tonic-gate top:
16530Sstevel@tonic-gate 		ASSERT(phm == NULL);
16540Sstevel@tonic-gate 		ASSERT(index == PAGE_HASH_FUNC(vp, off));
16550Sstevel@tonic-gate 		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
16560Sstevel@tonic-gate 
16570Sstevel@tonic-gate 		if (npp == NULL) {
16580Sstevel@tonic-gate 			/*
16590Sstevel@tonic-gate 			 * Try to get a page of any color either from
16600Sstevel@tonic-gate 			 * the freelist or from the cache list.
16610Sstevel@tonic-gate 			 */
16620Sstevel@tonic-gate 			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
16630Sstevel@tonic-gate 			    flags & ~PG_MATCH_COLOR, mattr, NULL);
16640Sstevel@tonic-gate 			if (npp == NULL) {
16650Sstevel@tonic-gate 				if (mattr == NULL) {
16660Sstevel@tonic-gate 					/*
16670Sstevel@tonic-gate 					 * Not looking for a special page;
16680Sstevel@tonic-gate 					 * panic!
16690Sstevel@tonic-gate 					 */
16700Sstevel@tonic-gate 					panic("no page found %d", (int)npages);
16710Sstevel@tonic-gate 				}
16720Sstevel@tonic-gate 				/*
16730Sstevel@tonic-gate 				 * No page found! This can happen
16740Sstevel@tonic-gate 				 * if we are looking for a page
16750Sstevel@tonic-gate 				 * within a specific memory range
16760Sstevel@tonic-gate 				 * for DMA purposes. If PG_WAIT is
16770Sstevel@tonic-gate 				 * specified then we wait for a
16780Sstevel@tonic-gate 				 * while and then try again. The
16790Sstevel@tonic-gate 				 * wait could be forever if we
16800Sstevel@tonic-gate 				 * don't get the page(s) we need.
16810Sstevel@tonic-gate 				 *
16820Sstevel@tonic-gate 				 * Note: XXX We really need a mechanism
16830Sstevel@tonic-gate 				 * to wait for pages in the desired
16840Sstevel@tonic-gate 				 * range. For now, we wait for any
16850Sstevel@tonic-gate 				 * page and see if we can use it.
16860Sstevel@tonic-gate 				 */
16870Sstevel@tonic-gate 
16880Sstevel@tonic-gate 				if ((mattr != NULL) && (flags & PG_WAIT)) {
16890Sstevel@tonic-gate 					delay(10);
16900Sstevel@tonic-gate 					goto top;
16910Sstevel@tonic-gate 				}
16920Sstevel@tonic-gate 
16930Sstevel@tonic-gate 				goto fail; /* undo accounting stuff */
16940Sstevel@tonic-gate 			}
16950Sstevel@tonic-gate 
16960Sstevel@tonic-gate 			if (PP_ISAGED(npp) == 0) {
16970Sstevel@tonic-gate 				/*
16980Sstevel@tonic-gate 				 * Since this page came from the
16990Sstevel@tonic-gate 				 * cachelist, we must destroy the
17000Sstevel@tonic-gate 				 * old vnode association.
17010Sstevel@tonic-gate 				 */
17020Sstevel@tonic-gate 				page_hashout(npp, (kmutex_t *)NULL);
17030Sstevel@tonic-gate 			}
17040Sstevel@tonic-gate 		}
17050Sstevel@tonic-gate 
17060Sstevel@tonic-gate 		/*
17070Sstevel@tonic-gate 		 * We own this page!
17080Sstevel@tonic-gate 		 */
17090Sstevel@tonic-gate 		ASSERT(PAGE_EXCL(npp));
17100Sstevel@tonic-gate 		ASSERT(npp->p_vnode == NULL);
17110Sstevel@tonic-gate 		ASSERT(!hat_page_is_mapped(npp));
17120Sstevel@tonic-gate 		PP_CLRFREE(npp);
17130Sstevel@tonic-gate 		PP_CLRAGED(npp);
17140Sstevel@tonic-gate 
17150Sstevel@tonic-gate 		/*
17160Sstevel@tonic-gate 		 * Here we have a page in our hot little mitts and are
17170Sstevel@tonic-gate 		 * just waiting to stuff it on the appropriate lists.
17180Sstevel@tonic-gate 		 * Get the mutex and check to see if it really does
17190Sstevel@tonic-gate 		 * not exist.
17200Sstevel@tonic-gate 		 */
17210Sstevel@tonic-gate 		phm = PAGE_HASH_MUTEX(index);
17220Sstevel@tonic-gate 		mutex_enter(phm);
17230Sstevel@tonic-gate 		PAGE_HASH_SEARCH(index, pp, vp, off);
17240Sstevel@tonic-gate 		if (pp == NULL) {
17250Sstevel@tonic-gate 			VM_STAT_ADD(page_create_new);
17260Sstevel@tonic-gate 			pp = npp;
17270Sstevel@tonic-gate 			npp = NULL;
17280Sstevel@tonic-gate 			if (!page_hashin(pp, vp, off, phm)) {
17290Sstevel@tonic-gate 				/*
17300Sstevel@tonic-gate 				 * Since we hold the page hash mutex and
17310Sstevel@tonic-gate 				 * just searched for this page, page_hashin
17320Sstevel@tonic-gate 				 * had better not fail.  If it does, that
17330Sstevel@tonic-gate 				 * means some thread did not follow the
17340Sstevel@tonic-gate 				 * page hash mutex rules.  Panic now and
17350Sstevel@tonic-gate 				 * get it over with.  As usual, go down
17360Sstevel@tonic-gate 				 * holding all the locks.
17370Sstevel@tonic-gate 				 */
17380Sstevel@tonic-gate 				ASSERT(MUTEX_HELD(phm));
17390Sstevel@tonic-gate 				panic("page_create: hashin fail %p %p %llx %p",
17400Sstevel@tonic-gate 				    (void *)pp, (void *)vp, off, (void *)phm);
17410Sstevel@tonic-gate 
17420Sstevel@tonic-gate 			}
17430Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(phm));
17440Sstevel@tonic-gate 			mutex_exit(phm);
17450Sstevel@tonic-gate 			phm = NULL;
17460Sstevel@tonic-gate 
17470Sstevel@tonic-gate 			/*
17480Sstevel@tonic-gate 			 * Hat layer locking need not be done to set
17490Sstevel@tonic-gate 			 * the following bits since the page is not hashed
17500Sstevel@tonic-gate 			 * and was on the free list (i.e., had no mappings).
17510Sstevel@tonic-gate 			 *
17520Sstevel@tonic-gate 			 * Set the reference bit to protect
17530Sstevel@tonic-gate 			 * against immediate pageout
17540Sstevel@tonic-gate 			 *
17550Sstevel@tonic-gate 			 * XXXmh modify freelist code to set reference
17560Sstevel@tonic-gate 			 * bit so we don't have to do it here.
17570Sstevel@tonic-gate 			 */
17580Sstevel@tonic-gate 			page_set_props(pp, P_REF);
17590Sstevel@tonic-gate 		} else {
17600Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(phm));
17610Sstevel@tonic-gate 			mutex_exit(phm);
17620Sstevel@tonic-gate 			phm = NULL;
17630Sstevel@tonic-gate 			/*
17640Sstevel@tonic-gate 			 * NOTE: This should not happen for pages associated
17650Sstevel@tonic-gate 			 *	 with kernel vnode 'kvp'.
17660Sstevel@tonic-gate 			 */
17670Sstevel@tonic-gate 			/* XX64 - to debug why this happens! */
17680Sstevel@tonic-gate 			ASSERT(vp != &kvp);
17690Sstevel@tonic-gate 			if (vp == &kvp)
17700Sstevel@tonic-gate 				cmn_err(CE_NOTE,
17710Sstevel@tonic-gate 				    "page_create: page not expected "
17720Sstevel@tonic-gate 				    "in hash list for kernel vnode - pp 0x%p",
17730Sstevel@tonic-gate 				    (void *)pp);
17740Sstevel@tonic-gate 			VM_STAT_ADD(page_create_exists);
17750Sstevel@tonic-gate 			goto fail;
17760Sstevel@tonic-gate 		}
17770Sstevel@tonic-gate 
17780Sstevel@tonic-gate 		/*
17790Sstevel@tonic-gate 		 * Got a page!  It is locked.  Acquire the i/o
17800Sstevel@tonic-gate 		 * lock since we are going to use the p_next and
17810Sstevel@tonic-gate 		 * p_prev fields to link the requested pages together.
17820Sstevel@tonic-gate 		 */
17830Sstevel@tonic-gate 		page_io_lock(pp);
17840Sstevel@tonic-gate 		page_add(&plist, pp);
17850Sstevel@tonic-gate 		plist = plist->p_next;
17860Sstevel@tonic-gate 		off += MMU_PAGESIZE;
17870Sstevel@tonic-gate 		vaddr += MMU_PAGESIZE;
17880Sstevel@tonic-gate 	}
17890Sstevel@tonic-gate 
17900Sstevel@tonic-gate 	check_dma(mattr, plist, pages_req);
17910Sstevel@tonic-gate 	return (plist);
17920Sstevel@tonic-gate 
17930Sstevel@tonic-gate fail:
17940Sstevel@tonic-gate 	if (npp != NULL) {
17950Sstevel@tonic-gate 		/*
17960Sstevel@tonic-gate 		 * Did not need this page after all.
17970Sstevel@tonic-gate 		 * Put it back on the free list.
17980Sstevel@tonic-gate 		 */
17990Sstevel@tonic-gate 		VM_STAT_ADD(page_create_putbacks);
18000Sstevel@tonic-gate 		PP_SETFREE(npp);
18010Sstevel@tonic-gate 		PP_SETAGED(npp);
18020Sstevel@tonic-gate 		npp->p_offset = (u_offset_t)-1;
18030Sstevel@tonic-gate 		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
18040Sstevel@tonic-gate 		page_unlock(npp);
18050Sstevel@tonic-gate 	}
18060Sstevel@tonic-gate 
18070Sstevel@tonic-gate 	/*
18080Sstevel@tonic-gate 	 * Give up the pages we already got.
18090Sstevel@tonic-gate 	 */
18100Sstevel@tonic-gate 	while (plist != NULL) {
18110Sstevel@tonic-gate 		pp = plist;
18120Sstevel@tonic-gate 		page_sub(&plist, pp);
18130Sstevel@tonic-gate 		page_io_unlock(pp);
18140Sstevel@tonic-gate 		plist_len++;
18150Sstevel@tonic-gate 		/*LINTED: constant in conditional ctx*/
18160Sstevel@tonic-gate 		VN_DISPOSE(pp, B_INVAL, 0, kcred);
18170Sstevel@tonic-gate 	}
18180Sstevel@tonic-gate 
18190Sstevel@tonic-gate 	/*
18200Sstevel@tonic-gate 	 * VN_DISPOSE does freemem accounting for the pages in plist
18210Sstevel@tonic-gate 	 * by calling page_free. So, we need to undo the pcf accounting
18220Sstevel@tonic-gate 	 * for only the remaining pages.
18230Sstevel@tonic-gate 	 */
18240Sstevel@tonic-gate 	VM_STAT_ADD(page_create_putbacks);
18250Sstevel@tonic-gate 	page_create_putback(pages_req - plist_len);
18260Sstevel@tonic-gate 
18270Sstevel@tonic-gate 	return (NULL);
18280Sstevel@tonic-gate }
18290Sstevel@tonic-gate 
18300Sstevel@tonic-gate 
18310Sstevel@tonic-gate /*
18320Sstevel@tonic-gate  * Copy the data from the physical page represented by "frompp" to
18330Sstevel@tonic-gate  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
18340Sstevel@tonic-gate  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
18350Sstevel@tonic-gate  * level and no one sleeps with an active mapping there.
18360Sstevel@tonic-gate  *
18370Sstevel@tonic-gate  * Note that the ref/mod bits in the page_t's are not affected by
18380Sstevel@tonic-gate  * this operation, hence it is up to the caller to update them appropriately.
18390Sstevel@tonic-gate  */
18400Sstevel@tonic-gate void
18410Sstevel@tonic-gate ppcopy(page_t *frompp, page_t *topp)
18420Sstevel@tonic-gate {
18430Sstevel@tonic-gate 	caddr_t		pp_addr1;
18440Sstevel@tonic-gate 	caddr_t		pp_addr2;
18450Sstevel@tonic-gate 	void		*pte1;
18460Sstevel@tonic-gate 	void		*pte2;
18470Sstevel@tonic-gate 	kmutex_t	*ppaddr_mutex;
18480Sstevel@tonic-gate 
18490Sstevel@tonic-gate 	ASSERT_STACK_ALIGNED();
18500Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(frompp));
18510Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(topp));
18520Sstevel@tonic-gate 
18530Sstevel@tonic-gate 	if (kpm_enable) {
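		/*
		 * segkpm keeps a permanent kernel mapping for every physical
		 * page, so no temporary mapping is needed here.
		 */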
18540Sstevel@tonic-gate 		pp_addr1 = hat_kpm_page2va(frompp, 0);
18550Sstevel@tonic-gate 		pp_addr2 = hat_kpm_page2va(topp, 0);
18560Sstevel@tonic-gate 		kpreempt_disable();
18570Sstevel@tonic-gate 	} else {
18580Sstevel@tonic-gate 		/*
18590Sstevel@tonic-gate 		 * disable preemption so that the CPU can't change underneath us
18600Sstevel@tonic-gate 		 */
18610Sstevel@tonic-gate 		kpreempt_disable();
18620Sstevel@tonic-gate 
18630Sstevel@tonic-gate 		pp_addr1 = CPU->cpu_caddr1;
18640Sstevel@tonic-gate 		pp_addr2 = CPU->cpu_caddr2;
18650Sstevel@tonic-gate 		pte1 = (void *)CPU->cpu_caddr1pte;
18660Sstevel@tonic-gate 		pte2 = (void *)CPU->cpu_caddr2pte;
18670Sstevel@tonic-gate 
18680Sstevel@tonic-gate 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
18690Sstevel@tonic-gate 		mutex_enter(ppaddr_mutex);
18700Sstevel@tonic-gate 
18710Sstevel@tonic-gate 		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
18720Sstevel@tonic-gate 		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
18730Sstevel@tonic-gate 		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
18740Sstevel@tonic-gate 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
18750Sstevel@tonic-gate 		    HAT_LOAD_NOCONSIST);
18760Sstevel@tonic-gate 	}
18770Sstevel@tonic-gate 
18780Sstevel@tonic-gate 	if (use_sse_pagecopy)
18790Sstevel@tonic-gate 		hwblkpagecopy(pp_addr1, pp_addr2);
18800Sstevel@tonic-gate 	else
18810Sstevel@tonic-gate 		bcopy(pp_addr1, pp_addr2, PAGESIZE);
18820Sstevel@tonic-gate 
18830Sstevel@tonic-gate 	if (!kpm_enable)
18840Sstevel@tonic-gate 		mutex_exit(ppaddr_mutex);
18850Sstevel@tonic-gate 	kpreempt_enable();
18860Sstevel@tonic-gate }
18870Sstevel@tonic-gate 
18880Sstevel@tonic-gate /*
18890Sstevel@tonic-gate  * Zero the physical page given by `pp' from off to off + len
18900Sstevel@tonic-gate  * without changing the reference and modified bits of the page.
18910Sstevel@tonic-gate  *
18920Sstevel@tonic-gate  * We do this using CPU private page address #2; see ppcopy() for more info.
18930Sstevel@tonic-gate  * pagezero() must not be called at interrupt level.
18940Sstevel@tonic-gate  */
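/*
 * A typical call zeroes a whole page: pagezero(pp, 0, MMU_PAGESIZE),
 * with `pp' already locked by the caller.
 */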
18950Sstevel@tonic-gate void
18960Sstevel@tonic-gate pagezero(page_t *pp, uint_t off, uint_t len)
18970Sstevel@tonic-gate {
18980Sstevel@tonic-gate 	caddr_t		pp_addr2;
18990Sstevel@tonic-gate 	void		*pte2;
19000Sstevel@tonic-gate 	kmutex_t	*ppaddr_mutex;
19010Sstevel@tonic-gate 
19020Sstevel@tonic-gate 	ASSERT_STACK_ALIGNED();
19030Sstevel@tonic-gate 	ASSERT(len <= MMU_PAGESIZE);
19040Sstevel@tonic-gate 	ASSERT(off <= MMU_PAGESIZE);
19050Sstevel@tonic-gate 	ASSERT(off + len <= MMU_PAGESIZE);
19060Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp));
19070Sstevel@tonic-gate 
19080Sstevel@tonic-gate 	if (kpm_enable) {
19090Sstevel@tonic-gate 		pp_addr2 = hat_kpm_page2va(pp, 0);
19100Sstevel@tonic-gate 		kpreempt_disable();
19110Sstevel@tonic-gate 	} else {
19120Sstevel@tonic-gate 		kpreempt_disable();
19130Sstevel@tonic-gate 
19140Sstevel@tonic-gate 		pp_addr2 = CPU->cpu_caddr2;
19150Sstevel@tonic-gate 		pte2 = (void *)CPU->cpu_caddr2pte;
19160Sstevel@tonic-gate 
19170Sstevel@tonic-gate 		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
19180Sstevel@tonic-gate 		mutex_enter(ppaddr_mutex);
19190Sstevel@tonic-gate 
19200Sstevel@tonic-gate 		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
19210Sstevel@tonic-gate 		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
19220Sstevel@tonic-gate 		    HAT_LOAD_NOCONSIST);
19230Sstevel@tonic-gate 	}
19240Sstevel@tonic-gate 
19250Sstevel@tonic-gate 	if (use_sse_pagezero)
19260Sstevel@tonic-gate 		hwblkclr(pp_addr2 + off, len);
19270Sstevel@tonic-gate 	else
19280Sstevel@tonic-gate 		bzero(pp_addr2 + off, len);
19290Sstevel@tonic-gate 
19300Sstevel@tonic-gate 	if (!kpm_enable)
19310Sstevel@tonic-gate 		mutex_exit(ppaddr_mutex);
19320Sstevel@tonic-gate 	kpreempt_enable();
19330Sstevel@tonic-gate }
19340Sstevel@tonic-gate 
19350Sstevel@tonic-gate /*
19360Sstevel@tonic-gate  * Platform-dependent page scrub call.
19370Sstevel@tonic-gate  */
19380Sstevel@tonic-gate void
19390Sstevel@tonic-gate pagescrub(page_t *pp, uint_t off, uint_t len)
19400Sstevel@tonic-gate {
19410Sstevel@tonic-gate 	/*
19420Sstevel@tonic-gate 	 * For now, we rely on the fact that pagezero() will
19430Sstevel@tonic-gate 	 * always clear UEs.
19440Sstevel@tonic-gate 	 */
19450Sstevel@tonic-gate 	pagezero(pp, off, len);
19460Sstevel@tonic-gate }
19470Sstevel@tonic-gate 
19480Sstevel@tonic-gate /*
19490Sstevel@tonic-gate  * set up two private addresses on a given CPU, for use by ppcopy() and pagezero()
19500Sstevel@tonic-gate  */
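/*
 * Each CPU gets its own pair, so copies on different CPUs never contend;
 * cpu_ppaddr_mutex serializes use of the pair within a CPU, and ppcopy()
 * and pagezero() disable preemption while the temporary mappings are live.
 */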
19510Sstevel@tonic-gate void
19520Sstevel@tonic-gate setup_vaddr_for_ppcopy(struct cpu *cpup)
19530Sstevel@tonic-gate {
19540Sstevel@tonic-gate 	void *addr;
19550Sstevel@tonic-gate 	void *pte;
19560Sstevel@tonic-gate 
19570Sstevel@tonic-gate 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
19580Sstevel@tonic-gate 	pte = hat_mempte_setup(addr);
19590Sstevel@tonic-gate 	cpup->cpu_caddr1 = addr;
19600Sstevel@tonic-gate 	cpup->cpu_caddr1pte = (pteptr_t)pte;
19610Sstevel@tonic-gate 
19620Sstevel@tonic-gate 	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
19630Sstevel@tonic-gate 	pte = hat_mempte_setup(addr);
19640Sstevel@tonic-gate 	cpup->cpu_caddr2 = addr;
19650Sstevel@tonic-gate 	cpup->cpu_caddr2pte = (pteptr_t)pte;
19660Sstevel@tonic-gate 
19670Sstevel@tonic-gate 	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
19680Sstevel@tonic-gate }
19690Sstevel@tonic-gate 
19700Sstevel@tonic-gate 
19710Sstevel@tonic-gate /*
19720Sstevel@tonic-gate  * Create the pageout scanner thread, which starts in 'procedure',
19730Sstevel@tonic-gate  * runs within process 'pp', and has priority 'pri'.
19740Sstevel@tonic-gate  */
19750Sstevel@tonic-gate void
19760Sstevel@tonic-gate pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
19770Sstevel@tonic-gate {
19780Sstevel@tonic-gate 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
19790Sstevel@tonic-gate }
19800Sstevel@tonic-gate 
19810Sstevel@tonic-gate /*
19820Sstevel@tonic-gate  * Function for flushing D-cache when performing module relocations
19830Sstevel@tonic-gate  * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
19840Sstevel@tonic-gate  */
19850Sstevel@tonic-gate void
19860Sstevel@tonic-gate dcache_flushall()
19870Sstevel@tonic-gate {}