/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>	/* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 0;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern uintptr_t eprom_kernelbase;
extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */

/* 4g memory management */
pgcnt_t		maxmem4g;
pgcnt_t		freemem4g;
int		physmax4g;
int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int		lotsfree4gshift = 3;
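
/*
 * Example (illustrative): with the default shifts above, the 4g watermark
 * DESFREE4G is derived as maxmem4g >> desfree4gshift, i.e. 1/16 of maxmem4g,
 * and, by analogy, a LOTSFREE4G value would be maxmem4g >> lotsfree4gshift,
 * i.e. 1/8 of maxmem4g.
 */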

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

size_t auto_lpg_va_default = MMU_PAGESIZE;	/* used by zmap() */
/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	level_t l;

	if (remap)
		*remap = 0;

	switch (maptype) {

	case MAPPGSZ_STK:
	case MAPPGSZ_HEAP:
	case MAPPGSZ_VA:
		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (len < LEVEL_SIZE(l))
				continue;
			break;
		}
		return (LEVEL_SIZE(l));

	/*
	 * for ISM use the 1st large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (0);
}

/*
 * This can be patched via /etc/system to allow large pages
 * to be used for mapping application and library text segments.
 */
int	use_text_largepages = 0;

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */

/*ARGSUSED*/
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	size_t	pgsz;
	caddr_t	a;

	if (!text || !use_text_largepages ||
	    mmu.max_page_level == 0)
		return (0);

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}
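
/*
 * Illustrative note: for MAPPGSZ_STK/HEAP/VA, map_pgsz() walks down from
 * mmu.max_page_level until LEVEL_SIZE(l) fits in len.  Assuming the usual
 * x86 sizes of LEVEL_SIZE(0) == 4K and LEVEL_SIZE(1) == 2M, a 1MB request
 * falls through to level 0 and returns 4K, while an 8MB request stops at
 * level 1 and returns 2M.  MAPPGSZ_ISM simply takes the first large page
 * size whenever one exists.
 */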

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we faulted on a currently unmapped part of the
	 * UNIX data or stack segments.  If so, create a zfod mapping there
	 * and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X/4.X/5.X compatibility.
	 * This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}
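
/*
 * Worked example of the gap expansion above (illustrative, assuming 4K
 * pages): with base == 0x8047123 and len == 0x2000, P2ALIGN rounds base
 * down to 0x8047000 and P2ROUNDUP rounds base + len (0x8049123) up to
 * 0x804a000, so the zfod mapping attempt covers whole pages (len 0x3000)
 * on both sides of the faulting gap.
 */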

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below kernelbase.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}
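
/*
 * Worked example of the rounding above (illustrative, hypothetical values):
 * if as_gap() yields a candidate addr of 0x7fff6123000 with an align_amount
 * of 0x400000 and off's low bits equal to 0x3000, the address is first
 * rounded down to 0x7fff6000000, then 0x3000 is added giving 0x7fff6003000;
 * since that is below the candidate, align_amount is added once more,
 * landing on 0x7fff6403000, which keeps the offset's low bits congruent
 * with the mapping address.
 */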

/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}
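
/*
 * Note (illustrative): on amd64 the [hole_start, hole_end) interval is the
 * VA hole in the middle of the 64-bit address space.  valid_usr_range()
 * rejects any request that overlaps it, while valid_va_range() above
 * instead trims [base, base+len] to whichever side of the hole still holds
 * at least minlen bytes, preferring the side indicated by dir.
 */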

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
}


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	0x100000,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	0x01000,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
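
/*
 * For reference (illustrative, assuming 4K base pages): the arch_memranges
 * pfn boundaries above correspond to byte addresses of 4G (pfn 0x100000),
 * 2G (pfn 0x80000) and 16M (pfn 0x1000), matching the DMA ranges listed in
 * the comment above the table.
 */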

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
#if defined(__i386)
int restricted_kmemalloc = 1;	/* XX64 re-examine with PSARC 2004/405 */
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];


/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}

/*
 * return the mnoderange containing pfn
 */
int
pfn_2_mtype(pfn_t pfn)
{
	int	n;

	for (n = mnoderangecnt - 1; n >= 0; n--) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
}
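
/*
 * Example (illustrative, assuming 4K pages and the default arch_memranges):
 * memranges[] is ordered from the highest base pfn down, so memrange_num()
 * returns the first entry whose base is <= pfn; pfn 0x90000 (2.25G) gives
 * index 1 (the 2G-4G range) and pfn 0x500 falls through to the last index
 * (the 0-16M range).  pfn_2_mtype() scans mnoderanges[] the same way, from
 * the top down, returning the range whose mnr_pfnlo is <= pfn.
 */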

/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}
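
/*
 * Worked example of the segment-boundary check above (illustrative): with
 * dma_attr_seg of 0xffffff (16M - 1) and 4K pages, pfnseg is 0xfff.  For
 * *pfnp == 0xffa and minctg == 10, (0xffa + 9) & 0xfff == 0x003, which is
 * less than 0xffa & 0xfff, so the run would cross a 16M segment boundary;
 * *pfnp is advanced to roundup(0xffa, 0x1000) == 0x1000 and NULL is
 * returned so the caller retries at the boundary.
 */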

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (mmu_ptob((uint64_t)pp->p_pagenum) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
		pp = pp->p_next;
	}
}
#endif

static kmutex_t	contig_lock;

#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}
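
/*
 * Note (illustrative): minctg = howmany(*pgcnt, sgllen) is the smallest
 * contiguous run that still lets the remaining pages fit in the remaining
 * scatter-gather entries; e.g. 17 pages with an sgllen of 4 requires runs
 * of at least 5 pages, and minctg is recomputed each time a run is taken
 * and sgllen is decremented.
 */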

/*
 * combine mem_node_config and memrange memory ranges into one data
 * structure to be used for page list management.
 *
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 *
 * mnode_range_setup() initializes mnoderanges.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

int
mnode_range_cnt()
{
	int	mri;
	int	mnrcnt = 0;
	int	mnode;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		/* find the memranges index containing the base of the mnode range */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	return (mnrcnt);
}

void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo =
			    MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi =
			    MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}
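
/*
 * Example (illustrative): a single memory node spanning pfn 0 through
 * 0x17ffff (6G with 4K pages) overlaps all four default memranges, so
 * mnode_range_cnt() returns 4 and mnode_range_setup() creates mnoderanges
 * for 0-16M, 16M-2G, 2G-4G and 4G-6G, each tagged with the owning mnode
 * and its memrange index.
 */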

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
 * the range of indices to 0 or 4g.
 *
 * Return first mnode range type index found, otherwise return -1 if none
 * found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int	mtlim = 0;	/* default to PGI_MT_RANGE0 */

		if (flags & PGI_MT_NEXT)
			mtype--;
		if (flags & PGI_MT_RANGE4G)
			mtlim = mtype4g + 1;
		while (mtype >= mtlim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype--;
		}
	} else {
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
	}
	return (-1);
}

/*
 * Returns the free page count for mnode
 */
int
mnode_pgcnt(int mnode)
{
	int	mtype = mnoderangecnt - 1;
	int	flags = PGI_MT_RANGE0;
	pgcnt_t	pgcnt = 0;

	mtype = mtype_func(mnode, mtype, flags);

	while (mtype != -1) {
		pgcnt += (mnoderanges[mtype].mnr_mt_flpgcnt +
		    mnoderanges[mtype].mnr_mt_lgpgcnt +
		    mnoderanges[mtype].mnr_mt_clpgcnt);
		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
	}
	return (pgcnt);
}
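
/*
 * Note (illustrative): with PGI_MT_RANGE0 the search walks mtype downward
 * from mnoderangecnt - 1 to 0 and returns each mnoderange owned by mnode;
 * PGI_MT_NEXT continues the walk below the previous match, and
 * PGI_MT_RANGE4G stops at mtype4g + 1 so only ranges above 4G are visited.
 * mnode_pgcnt() uses this to sum the freelist, large page and cachelist
 * counts over every range belonging to the node.
 */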

/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
	memranges += i;
	nranges -= i;
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;

	/*
	 * setup pagesize for generic page layer
	 */
	for (i = 0; i <= mmu.max_page_level; ++i) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
	}

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* size for mnoderanges */
	mnoderangecnt = mnode_range_cnt();
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}
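
/*
 * Worked example of the color calculation above (illustrative, assuming 4K
 * pages): a 512K 8-way L2 gives l2_colors = 512K / (8 * 4K) = 16, so
 * page_colors is 16 (the PAGE_COLORS_MIN floor) and cpu_page_colors stays 0;
 * a 2M 8-way L2 gives l2_colors = 64 and page_colors = 64.
 */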

/*
 * Called once at startup to configure page_coloring data structures and
 * do the 1st page_free()/page_freelist_add().
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}

/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}
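
/*
 * Layout note (illustrative): page_coloring_setup() carves pcmemaddr in the
 * same order page_coloring_init() sized it: the mnoderanges array, NPC_MUTEX
 * arrays each of fpc_mutex and cpc_mutex, the top-level page_freelists and
 * page_cachelists pointer arrays, and then, per mnoderange, a per-pagesize
 * array of per-color freelist heads followed by the per-color cachelist
 * heads.
 */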

/*
 * get a page from any list with the given mnode
 */
page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;
	uint64_t	pgaddr;
	ulong_t		bin;
	int		mtypestart;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);


	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		i = 0;
		while (i <= page_colors) {
			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp) {
						pp = NULL;
					}
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;

			}
			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			pp = page_freelist_fill(szc, bin, mnode, mtype,
			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
			if (pp)
				return (pp);

			/* try next bin */
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
			i++;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						break;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */

				pgaddr = ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}

/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
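
/*
 * Illustrative sketch (hypothetical, not part of the original source):
 * a caller that must stay below 16 MB for a legacy DMA engine would
 * describe the constraint with a ddi_dma_attr_t and let page_create_io()
 * pass it down to this routine, e.g.
 *
 *	ddi_dma_attr_t dma_attr;
 *
 *	bzero(&dma_attr, sizeof (dma_attr));
 *	dma_attr.dma_attr_version = DMA_ATTR_V0;
 *	dma_attr.dma_attr_addr_lo = 0;
 *	dma_attr.dma_attr_addr_hi = 0xFFFFFFull;	(16 MB - 1)
 *	dma_attr.dma_attr_align = MMU_PAGESIZE;
 *
 * Only the base page size is supported here, and an alignment stricter
 * than MMU_PAGESIZE is rejected up front, as the checks below show.
 */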
/*ARGSUSED*/
page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;
	int		m;
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it. Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = 0;
		m = mnoderangecnt - 1;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can guarantee alignment only for page boundaries.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	if (n > m)
		return (NULL);

	szc = 0;

	/* cycling through mtype handled by RANGE0 if n == 0 */
	if (n == 0) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		for (mtype = m; mtype >= n; mtype--) {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		}
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}

/*
 * page_create_io()
 *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator, so it is used only to create new pages (i.e., PG_EXCL is
 * set).
 *
 * Note: This interface is currently used by the x86 PSM only and is
 * not fully specified, so the commitment level is only that of a
 * private interface specific to x86. This interface uses the
 * PSM-specific page_get_anylist() interface.
 */
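
/*
 * Illustrative sketch (hypothetical, not part of the original source):
 * a PSM-level caller that needs 'len' bytes of fresh, exclusively locked
 * pages satisfying 'dma_attr' might invoke this interface roughly as
 *
 *	plist = page_create_io(vp, off, len, PG_EXCL | PG_WAIT,
 *	    as, vaddr, &dma_attr);
 *	if (plist == NULL)
 *		... handle the allocation failure ...
 *
 * where vp/off name the new pages and as/vaddr only seed the page color
 * selection.  The caller owns the returned pages and must unlock or free
 * them when it is done with them.
 */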

#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}


page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

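	/*
	 * At this point page_create_wait() has reserved 'npages' pages of
	 * freemem/pcf accounting.  Every exit path below must balance that
	 * reservation: pages handed back to the caller carry it, pages
	 * discarded via VN_DISPOSE() return it through page_free(), and any
	 * remainder is undone explicitly with page_create_putback().
	 */
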
	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx",
	    vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back. The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock. This will minimize the hash
	 * lock hold time, nesting, and the like. If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use them.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}

				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail. If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules. Panic now and
				 * get it over with. As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(vp != &kvp);
			if (vp == &kvp)
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page! It is locked. Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}


/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
 */
void
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	void		*pte1;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable preemption so that the CPU can't change under us
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = (void *)CPU->cpu_caddr1pte;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagecopy)
		hwblkpagecopy(pp_addr1, pp_addr2);
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

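/*
 * Illustrative sketch (hypothetical, not part of the original source):
 * a caller replicating one page's contents into another holds both
 * pages locked and then updates ref/mod state itself, since ppcopy()
 * deliberately leaves the page_t bits untouched:
 *
 *	ASSERT(PAGE_LOCKED(srcpp) && PAGE_LOCKED(dstpp));
 *	ppcopy(srcpp, dstpp);
 *	hat_setrefmod(dstpp);	(assumed helper from <vm/hat.h>)
 *
 * Note that both the kpm path and the cpu_caddr1/cpu_caddr2 path above
 * run with preemption disabled, so the copy itself must not sleep.
 */
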
/*
 * Zero the physical page from off to off + len given by `pp'
 * without changing the reference and modified bits of page.
 *
 * We do this using the CPU-private page address #2; see ppcopy() for
 * more info.  pagezero() must not be called at interrupt level.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);
	ASSERT(PAGE_LOCKED(pp));

	if (kpm_enable) {
		pp_addr2 = hat_kpm_page2va(pp, 0);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero)
		hwblkclr(pp_addr2 + off, len);
	else
		bzero(pp_addr2 + off, len);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

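/*
 * Illustrative sketch (hypothetical, not part of the original source):
 * the typical use of pagezero() is clearing the unfilled tail of a
 * locked page after a short read, roughly
 *
 *	ASSERT(PAGE_LOCKED(pp));
 *	if (io_len < PAGESIZE)
 *		pagezero(pp, (uint_t)io_len, (uint_t)(PAGESIZE - io_len));
 *
 * which leaves the page's ref/mod bits unchanged, per the contract above.
 */
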
/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}

/*
 * Set up two private virtual addresses on a given CPU for use by ppcopy().
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	void *pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = (pteptr_t)pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = (pteptr_t)pte;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}


/*
 * Create the pageout scanner thread. The thread starts at `procedure',
 * belongs to process pp, and runs at priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * any use for this?
 */
void
post_startup_mmu_initialization(void)
{}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping. Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall()
{}