/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>
#include <util/qsort.h>
#include <sys/taskq.h>

#ifdef __xpv

#include <sys/hypervisor.h>
#include <sys/xen_mmu.h>
#include <sys/balloon_impl.h>

/*
 * Domain 0 pages usable for DMA are kept pre-allocated, in distinct
 * lists ordered by increasing mfn.
 */
static kmutex_t io_pool_lock;
static kmutex_t contig_list_lock;
static page_t *io_pool_4g;	/* pool for 32 bit dma limited devices */
static page_t *io_pool_16m;	/* pool for 24 bit dma limited legacy devices */
static long io_pool_cnt;
static long io_pool_cnt_max = 0;
#define	DEFAULT_IO_POOL_MIN	128
static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
static long io_pool_cnt_lowater = 0;
static long io_pool_shrink_attempts; /* how many times did we try to shrink */
static long io_pool_shrinks;	/* how many times did we really shrink */
static long io_pool_grows;	/* how many times did we grow */
static mfn_t start_mfn = 1;
static caddr_t io_pool_kva;	/* use to alloc pages when needed */

static int create_contig_pfnlist(uint_t);

/*
 * percentage of phys mem to hold in the i/o pool
 */
#define	DEFAULT_IO_POOL_PCT	2
static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
static void page_io_pool_sub(page_t **, page_t *, page_t *);
int ioalloc_dbg = 0;

#endif /* __xpv */

uint_t vac_colors = 1;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
/*
 * Allow users to disable the kernel's use of SSE.
 */
extern int use_sse_pagecopy, use_sse_pagezero;

/*
 * combined memory ranges from mnode and memranges[] to manage single
 * mnode/mtype dimension in the page lists.
 */
typedef struct {
	pfn_t	mnr_pfnlo;
	pfn_t	mnr_pfnhi;
	int	mnr_mnode;
	int	mnr_memrange;		/* index into memranges[] */
	int	mnr_next;		/* next lower PA mnoderange */
	int	mnr_exists;
	/* maintain page list stats */
	pgcnt_t	mnr_mt_clpgcnt;		/* cache list cnt */
	pgcnt_t	mnr_mt_flpgcnt[MMU_PAGE_SIZES];	/* free list cnt per szc */
	pgcnt_t	mnr_mt_totcnt;	/* sum of cache and free lists */
#ifdef DEBUG
	struct mnr_mts {		/* mnode/mtype szc stats */
		pgcnt_t	mnr_mts_pgcnt;
		int	mnr_mts_colors;
		pgcnt_t	*mnr_mtsc_pgcnt;
	} *mnr_mts;
#endif
} mnoderange_t;

#define	MEMRANGEHI(mtype)						\
	((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
#define	MEMRANGELO(mtype)	(memranges[mtype])

#define	MTYPE_FREEMEM(mt)	(mnoderanges[mt].mnr_mt_totcnt)

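/*
 * Illustrative note (added commentary, not in the original source):
 * MEMRANGELO/MEMRANGEHI translate a memrange index into an inclusive pfn
 * range using the descending memranges[] table defined below.  For example,
 * with the default arch_memranges[] and a hypothetical physmax of 0x200000
 * (8G), index 1 (the 2G-4G range) spans pfns [0x80000, 0xfffff] and index 0
 * (4G and above) spans [0x100000, physmax].
 */
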
/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 *
 * These are listed in reverse order, so that we can skip over unused
 * ranges on machines with small memories.
 *
 * For now under the Hypervisor, we'll only ever have one memrange.
 */
#define	PFN_4GIG	0x100000
#define	PFN_16MEG	0x1000
/* Indices into the memory range (arch_memranges) array. */
#define	MRI_4G		0
#define	MRI_2G		1
#define	MRI_16M		2
#define	MRI_0		3
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	PFN_4GIG,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	PFN_16MEG,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * This combines mem_node_config and memranges into one data
 * structure to be used for page list management.
 */
mnoderange_t *mnoderanges;
int mnoderangecnt;
int mtype4g;
int mtype16m;
int mtypetop;	/* index of highest pfn'ed mnoderange */

/*
 * 4g memory management variables for systems with more than 4g of memory:
 *
 * physical memory below 4g is required for 32bit dma devices and, currently,
 * for kmem memory. On systems with more than 4g of memory, the pool of memory
 * below 4g can be depleted without any paging activity given that there is
 * likely to be sufficient memory above 4g.
 *
 * physmax4g is set true if the largest pfn is over 4g. The rest of the
 * 4g memory management code is enabled only when physmax4g is true.
 *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 * agp aperture etc.
 *
 * freemem4g maintains the count of the number of available pages on the
 * page lists with physical addresses below 4g.
 *
 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 * 6% (desfree4gshift = 4) of maxmem4g.
 *
 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 * and the amount of physical memory above 4g is greater than freemem4g.
 * In this case, page_get_* routines will restrict below 4g allocations
 * for requests that don't specifically require it.
 */
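/*
 * Illustrative note (added commentary, not in the original source): with the
 * default desfree4gshift of 4, DESFREE4G works out to maxmem4g / 16 (roughly
 * 6%).  On a hypothetical system with maxmem4g = 0x80000 pages (2G usable
 * below 4g), DESFREE4G is 0x8000 pages (128M), and RESTRICT4G_ALLOC triggers
 * once freemem4g drops under that while below-4g pages make up less than
 * half of freemem.
 */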

#define	DESFREE4G	(maxmem4g >> desfree4gshift)

#define	RESTRICT4G_ALLOC					\
	(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))

static pgcnt_t	maxmem4g;
static pgcnt_t	freemem4g;
static int	physmax4g;
static int	desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */

/*
 * 16m memory management:
 *
 * reserve some amount of physical memory below 16m for legacy devices.
 *
 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 * 16m or if the 16m pool drops below DESFREE16M.
 *
 * In this case, general page allocations via page_get_{free,cache}list
 * routines will be restricted from allocating from the 16m pool. Allocations
 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 * are not restricted.
 */

#define	FREEMEM16M	MTYPE_FREEMEM(mtype16m)
#define	DESFREE16M	desfree16m
#define	RESTRICT16M_ALLOC(freemem, pgcnt, flags)		\
	((freemem != 0) && ((flags & PG_PANIC) == 0) &&		\
	    ((freemem >= (FREEMEM16M)) ||			\
	    (FREEMEM16M < (DESFREE16M + pgcnt))))

static pgcnt_t	desfree16m = 0x380;

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
int restricted_kmemalloc = 0;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

/* page sizes that legacy applications can see */
uint_t mmu_legacy_page_sizes;

/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Maximum and default segment size tunables for user private
 * and shared anon memory, and user text and initialized data.
 * These can be patched via /etc/system to allow large pages
 * to be used for mapping application private and shared anon memory.
 */
size_t mcntl0_lpsize = MMU_PAGESIZE;
size_t max_uheap_lpsize = MMU_PAGESIZE;
size_t default_uheap_lpsize = MMU_PAGESIZE;
size_t max_ustack_lpsize = MMU_PAGESIZE;
size_t default_ustack_lpsize = MMU_PAGESIZE;
size_t max_privmap_lpsize = MMU_PAGESIZE;
size_t max_uidata_lpsize = MMU_PAGESIZE;
size_t max_utext_lpsize = MMU_PAGESIZE;
size_t max_shm_lpsize = MMU_PAGESIZE;


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;


/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];

/* Lock to protect mnoderanges array for memory DR operations. */
static kmutex_t mnoderange_lock;

/*
 * Only let one thread at a time try to coalesce large pages, to
 * prevent them from working against each other.
 */
static kmutex_t	contig_lock;
#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
	level_t l = 0;
	size_t pgsz = MMU_PAGESIZE;
	size_t max_lpsize;
	uint_t mszc;

	ASSERT(maptype != MAPPGSZ_VA);

	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
		return (MMU_PAGESIZE);
	}

	switch (maptype) {
	case MAPPGSZ_HEAP:
	case MAPPGSZ_STK:
		max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
		    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
		if (max_lpsize == MMU_PAGESIZE) {
			return (MMU_PAGESIZE);
		}
		if (len == 0) {
			len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
			    p->p_brksize - p->p_bssbase : p->p_stksize;
		}
		len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
		    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);

		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.umax_page_level; l > 0; --l) {
			if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
				continue;
			} else {
				pgsz = LEVEL_SIZE(l);
			}
			break;
		}

		mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
		    p->p_stkpageszc);
		if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
			pgsz = hw_page_array[mszc].hp_size;
		}
		return (pgsz);

	case MAPPGSZ_ISM:
		for (l = mmu.umax_page_level; l > 0; --l) {
			if (len >= LEVEL_SIZE(l))
				return (LEVEL_SIZE(l));
		}
		return (LEVEL_SIZE(0));
	}
	return (pgsz);
}

static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
    size_t min_physmem)
{
	caddr_t eaddr = addr + size;
	uint_t szcvec = 0;
	caddr_t raddr;
	caddr_t readdr;
	size_t pgsz;
	int i;

	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
		return (0);
	}

	for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
		pgsz = page_get_pagesize(i);
		if (pgsz > max_lpsize) {
			continue;
		}
		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		if (raddr < addr || raddr >= readdr) {
			continue;
		}
		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
			continue;
		}
		/*
		 * Set szcvec to the remaining page sizes.
		 */
		szcvec = ((1 << (i + 1)) - 1) & ~1;
		break;
	}
	return (szcvec);
}

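/*
 * Illustrative note (added commentary, not in the original source): the
 * szcvec computed above is a bitmask of usable size codes with bit 0 (the
 * base page size) cleared.  For example, if the largest size code whose
 * aligned region fits is i == 2, then ((1 << 3) - 1) & ~1 == 0x6, leaving
 * size codes 1 and 2 as candidates.
 */
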
/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */
/*ARGSUSED*/
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
	size_t max_lpsize = mcntl0_lpsize;

	if (mmu.max_page_level == 0)
		return (0);

	if (flags & MAP_TEXT) {
		if (!memcntl)
			max_lpsize = max_utext_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (flags & MAP_INITDATA) {
		if (!memcntl)
			max_lpsize = max_uidata_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_SHM) {
		if (!memcntl)
			max_lpsize = max_shm_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (type == MAPPGSZC_HEAP) {
		if (!memcntl)
			max_lpsize = max_uheap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_STACK) {
		if (!memcntl)
			max_lpsize = max_ustack_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else {
		if (!memcntl)
			max_lpsize = max_privmap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));
	}
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X, 4.X and 5.X compatibility.
	 * This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below userlimit.
 *
 * Every mapping will have a redzone of a single page on either side of
 * the request.  This is done to leave one page unmapped between segments.
 * This is not required, but it's useful for the user because if their
 * program strays across a segment boundary, it will catch a fault
 * immediately making debugging a little easier.  Currently the redzone
 * is mandatory.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *	If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 *	must be some "power of two" multiple of pagesize.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * vacalign is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	/* Make len be a multiple of PAGESIZE */
	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		/*
		 * For 32-bit processes, only those which have specified
		 * MAP_ALIGN and an addr will be aligned on a larger page size.
		 * Not doing so can potentially waste up to 1G of process
		 * address space.
		 */
		int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
		    mmu.umax_page_level;

		while (lvl && len < LEVEL_SIZE(lvl))
			--lvl;

		align_amount = LEVEL_SIZE(lvl);
	}
	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	ASSERT(ISP2(align_amount));
	ASSERT(align_amount == 0 || align_amount >= PAGESIZE);

	off = off & (align_amount - 1);
	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.
	 */
	if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
	    PAGESIZE, off) == 0) {
		caddr_t as_addr;

		/*
		 * addr is the highest possible address to use since we have
		 * a PAGESIZE redzone at the beginning and end.
		 */
		addr = base + slen - (PAGESIZE + len);
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount and
		 * add the offset in.
		 * If addr is greater than as_addr, len would not be large
		 * enough to include the redzone, so we must adjust down
		 * by the alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)off;
		if (addr > as_addr) {
			addr -= align_amount;
		}

		ASSERT(addr > base);
		ASSERT(addr + len < base + slen);
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off)));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}

int valid_va_range_aligned_wraparound;

/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long, where the base of the range is at "off"
 * phase from an "align" boundary and there is space for a "redzone"-sized
 * redzone on either side of the range.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range (including
 * the redzone).  On failure, 0 is returned.
 */
/*ARGSUSED3*/
int
valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
    size_t align, size_t redzone, size_t off)
{
	uintptr_t hi, lo;
	size_t tot_len;

	ASSERT(align == 0 ? off == 0 : off < align);
	ASSERT(ISP2(align));
	ASSERT(align == 0 || align >= PAGESIZE);

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;
	tot_len = minlen + 2 * redzone; /* need at least this much space */

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		*lenp = 0UL - lo - 1UL;
		/* See if this really happens. If so, then we figure out why */
		valid_va_range_aligned_wraparound++;
		hi = lo + *lenp;
	}
	if (*lenp < tot_len) {
		return (0);
	}

#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= tot_len)
						hi = hole_start;
					else if (hi - hole_end >= tot_len)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= tot_len)
						lo = hole_end;
					else if (hole_start - lo >= tot_len)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}
#endif

	if (hi - lo < tot_len)
		return (0);

	if (align > 1) {
		uintptr_t tlo = lo + redzone;
		uintptr_t thi = hi - redzone;
		tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
		if (tlo < lo + redzone) {
			return (0);
		}
		if (thi < tlo || thi - tlo < minlen) {
			return (0);
		}
	}

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
	return (1);
}

/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 * is returned.
 */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
}

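/*
 * Illustrative note (added commentary, not in the original source):
 * valid_va_range() is simply valid_va_range_aligned() with no alignment,
 * phase or redzone constraints.  As a hypothetical example, a caller that
 * needs a 2M-aligned range with a one-page redzone on each side could call
 * the aligned variant with align = 0x200000, redzone = PAGESIZE, off = 0.
 */
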
/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	if (pfn_is_foreign(pf))
		return (0);
	return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
}

/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}

/*
 * return the mnoderange containing pfn
 */
/*ARGSUSED*/
int
pfn_2_mtype(pfn_t pfn)
{
#if defined(__xpv)
	return (0);
#else
	int	n;

	/* Always start from highest pfn and work our way down */
	for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
#endif
}
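
/*
 * Illustrative note (added commentary, not in the original source): since
 * mnr_next links are ordered by decreasing mnr_pfnlo, the loop above visits
 * the highest-PA mnoderange first.  On a hypothetical single-mnode 8G
 * machine the walk checks the 4G+, 2G-4G, 16M-2G and 0-16M ranges in that
 * order, stopping at the first range whose mnr_pfnlo is <= pfn.
 */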

#if !defined(__xpv)
/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}
#endif	/* !__xpv */

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (void)(0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
		pp = pp->p_next;
	}
}
#endif

#if !defined(__xpv)
static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}
#endif	/* !__xpv */

/*
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 */
int
mnode_range_cnt(int mnode)
{
#if defined(__xpv)
	ASSERT(mnode == 0);
	return (1);
#else	/* __xpv */
	int	mri;
	int	mnrcnt = 0;

	if (mem_node_config[mnode].exists != 0) {
		mri = nranges - 1;

		/* find the memranges index of the range holding physbase */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
	return (mnrcnt);
#endif	/* __xpv */
}

/*
 * mnode_range_setup() initializes mnoderanges.
 */
void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	mnoderange_t *mp = mnoderanges;
	int	mnode, mri;
	int	mindex = 0;	/* current index into mnoderanges array */
	int	i, j;
	pfn_t	hipfn;
	int	last, hi;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges->mnr_exists = 1;
			mnoderanges++;
			mindex++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}

	/*
	 * For now do a simple sort of the mnoderanges array to fill in
	 * the mnr_next fields.  Since mindex is expected to be relatively
	 * small, using a simple O(N^2) algorithm.
	 */
	for (i = 0; i < mindex; i++) {
		if (mp[i].mnr_pfnlo == 0)	/* find lowest */
			break;
	}
	ASSERT(i < mindex);
	last = i;
	mtype16m = last;
	mp[last].mnr_next = -1;
	for (i = 0; i < mindex - 1; i++) {
		hipfn = (pfn_t)(-1);
		hi = -1;
		/* find next highest mnode range */
		for (j = 0; j < mindex; j++) {
			if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo &&
			    mp[j].mnr_pfnlo < hipfn) {
				hipfn = mp[j].mnr_pfnlo;
				hi = j;
			}
		}
		mp[hi].mnr_next = last;
		last = hi;
	}
	mtypetop = last;
}
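
/*
 * Illustrative note (added commentary, not in the original source): after
 * the sort above, mtype16m names the entry whose mnr_pfnlo is 0 and mtypetop
 * names the entry with the highest mnr_pfnlo; following mnr_next from
 * mtypetop walks the ranges in decreasing PA order and ends at mtype16m
 * (mnr_next == -1).  On a hypothetical single-mnode 8G machine this yields
 * the chain 4G+ -> 2G-4G -> 16M-2G -> 0-16M.
 */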
1337*12004Sjiang.liu@intel.com */ 1338*12004Sjiang.liu@intel.com static void 1339*12004Sjiang.liu@intel.com mnode_range_add(int mnode) 1340*12004Sjiang.liu@intel.com { 1341*12004Sjiang.liu@intel.com int *prev; 1342*12004Sjiang.liu@intel.com int n, mri; 1343*12004Sjiang.liu@intel.com pfn_t start, end; 1344*12004Sjiang.liu@intel.com extern void membar_sync(void); 1345*12004Sjiang.liu@intel.com 1346*12004Sjiang.liu@intel.com ASSERT(0 <= mnode && mnode < max_mem_nodes); 1347*12004Sjiang.liu@intel.com ASSERT(mem_node_config[mnode].exists); 1348*12004Sjiang.liu@intel.com start = mem_node_config[mnode].physbase; 1349*12004Sjiang.liu@intel.com end = mem_node_config[mnode].physmax; 1350*12004Sjiang.liu@intel.com ASSERT(start <= end); 1351*12004Sjiang.liu@intel.com mutex_enter(&mnoderange_lock); 1352*12004Sjiang.liu@intel.com 1353*12004Sjiang.liu@intel.com #ifdef DEBUG 1354*12004Sjiang.liu@intel.com /* Check whether it interleaves with other memory nodes. */ 1355*12004Sjiang.liu@intel.com for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) { 1356*12004Sjiang.liu@intel.com ASSERT(mnoderanges[n].mnr_exists); 1357*12004Sjiang.liu@intel.com if (mnoderanges[n].mnr_mnode == mnode) 1358*12004Sjiang.liu@intel.com continue; 1359*12004Sjiang.liu@intel.com ASSERT(start > mnoderanges[n].mnr_pfnhi || 1360*12004Sjiang.liu@intel.com end < mnoderanges[n].mnr_pfnlo); 1361*12004Sjiang.liu@intel.com } 1362*12004Sjiang.liu@intel.com #endif /* DEBUG */ 1363*12004Sjiang.liu@intel.com 1364*12004Sjiang.liu@intel.com mri = nranges - 1; 1365*12004Sjiang.liu@intel.com while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1366*12004Sjiang.liu@intel.com mri--; 1367*12004Sjiang.liu@intel.com while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) { 1368*12004Sjiang.liu@intel.com /* Check whether mtype already exists. */ 1369*12004Sjiang.liu@intel.com for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) { 1370*12004Sjiang.liu@intel.com if (mnoderanges[n].mnr_mnode == mnode && 1371*12004Sjiang.liu@intel.com mnoderanges[n].mnr_memrange == mri) { 1372*12004Sjiang.liu@intel.com mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), 1373*12004Sjiang.liu@intel.com start); 1374*12004Sjiang.liu@intel.com mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), 1375*12004Sjiang.liu@intel.com end); 1376*12004Sjiang.liu@intel.com break; 1377*12004Sjiang.liu@intel.com } 1378*12004Sjiang.liu@intel.com } 1379*12004Sjiang.liu@intel.com 1380*12004Sjiang.liu@intel.com /* Add a new entry if it doesn't exist yet. */ 1381*12004Sjiang.liu@intel.com if (n == -1) { 1382*12004Sjiang.liu@intel.com /* Try to find an unused entry in mnoderanges array. */ 1383*12004Sjiang.liu@intel.com for (n = 0; n < mnoderangecnt; n++) { 1384*12004Sjiang.liu@intel.com if (mnoderanges[n].mnr_exists == 0) 1385*12004Sjiang.liu@intel.com break; 1386*12004Sjiang.liu@intel.com } 1387*12004Sjiang.liu@intel.com ASSERT(n < mnoderangecnt); 1388*12004Sjiang.liu@intel.com mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start); 1389*12004Sjiang.liu@intel.com mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end); 1390*12004Sjiang.liu@intel.com mnoderanges[n].mnr_mnode = mnode; 1391*12004Sjiang.liu@intel.com mnoderanges[n].mnr_memrange = mri; 1392*12004Sjiang.liu@intel.com mnoderanges[n].mnr_exists = 1; 1393*12004Sjiang.liu@intel.com /* Page 0 should always be present. 
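			 * Because the hot-added range cannot contain pfn 0,
			 * the walk below can never run past the entry whose
			 * mnr_pfnlo is 0, so it always stops before falling
			 * off the end of the chain; the ASSERTs check that
			 * bound and the no-overlap ordering along the way.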
*/ 1394*12004Sjiang.liu@intel.com for (prev = &mtypetop; 1395*12004Sjiang.liu@intel.com mnoderanges[*prev].mnr_pfnlo > start; 1396*12004Sjiang.liu@intel.com prev = &mnoderanges[*prev].mnr_next) { 1397*12004Sjiang.liu@intel.com ASSERT(mnoderanges[*prev].mnr_next >= 0); 1398*12004Sjiang.liu@intel.com ASSERT(mnoderanges[*prev].mnr_pfnlo > end); 1399*12004Sjiang.liu@intel.com } 1400*12004Sjiang.liu@intel.com mnoderanges[n].mnr_next = *prev; 1401*12004Sjiang.liu@intel.com membar_sync(); 1402*12004Sjiang.liu@intel.com *prev = n; 1403*12004Sjiang.liu@intel.com } 1404*12004Sjiang.liu@intel.com 1405*12004Sjiang.liu@intel.com if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1406*12004Sjiang.liu@intel.com mri--; 1407*12004Sjiang.liu@intel.com else 1408*12004Sjiang.liu@intel.com break; 1409*12004Sjiang.liu@intel.com } 1410*12004Sjiang.liu@intel.com 1411*12004Sjiang.liu@intel.com mutex_exit(&mnoderange_lock); 1412*12004Sjiang.liu@intel.com } 1413*12004Sjiang.liu@intel.com 1414*12004Sjiang.liu@intel.com /* 1415*12004Sjiang.liu@intel.com * Update mnoderanges for memory hot-removal DR operations. 1416*12004Sjiang.liu@intel.com */ 1417*12004Sjiang.liu@intel.com static void 1418*12004Sjiang.liu@intel.com mnode_range_del(int mnode) 1419*12004Sjiang.liu@intel.com { 1420*12004Sjiang.liu@intel.com _NOTE(ARGUNUSED(mnode)); 1421*12004Sjiang.liu@intel.com ASSERT(0 <= mnode && mnode < max_mem_nodes); 1422*12004Sjiang.liu@intel.com /* TODO: support deletion operation. */ 1423*12004Sjiang.liu@intel.com ASSERT(0); 1424*12004Sjiang.liu@intel.com } 1425*12004Sjiang.liu@intel.com 1426*12004Sjiang.liu@intel.com void 1427*12004Sjiang.liu@intel.com plat_slice_add(pfn_t start, pfn_t end) 1428*12004Sjiang.liu@intel.com { 1429*12004Sjiang.liu@intel.com mem_node_add_slice(start, end); 1430*12004Sjiang.liu@intel.com if (plat_dr_enabled()) { 1431*12004Sjiang.liu@intel.com mnode_range_add(PFN_2_MEM_NODE(start)); 1432*12004Sjiang.liu@intel.com } 1433*12004Sjiang.liu@intel.com } 1434*12004Sjiang.liu@intel.com 1435*12004Sjiang.liu@intel.com void 1436*12004Sjiang.liu@intel.com plat_slice_del(pfn_t start, pfn_t end) 1437*12004Sjiang.liu@intel.com { 1438*12004Sjiang.liu@intel.com ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end)); 1439*12004Sjiang.liu@intel.com ASSERT(plat_dr_enabled()); 1440*12004Sjiang.liu@intel.com mnode_range_del(PFN_2_MEM_NODE(start)); 1441*12004Sjiang.liu@intel.com mem_node_del_slice(start, end); 1442*12004Sjiang.liu@intel.com } 1443*12004Sjiang.liu@intel.com #endif /* __xpv */ 1444*12004Sjiang.liu@intel.com 14455084Sjohnlev /*ARGSUSED*/ 14465084Sjohnlev int 14475084Sjohnlev mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz) 14485084Sjohnlev { 1449*12004Sjiang.liu@intel.com int mtype = mtypetop; 14505084Sjohnlev 14515084Sjohnlev #if !defined(__xpv) 14525084Sjohnlev #if defined(__i386) 14535084Sjohnlev /* 14545084Sjohnlev * set the mtype range 1455*12004Sjiang.liu@intel.com * - kmem requests need to be below 4g if restricted_kmemalloc is set. 14565084Sjohnlev * - for non kmem requests, set range to above 4g if memory below 4g 14575084Sjohnlev * runs low. 
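	 * The value returned is only the starting mtype (mtypetop, or
	 * mtype4g for restricted kmem allocations); the PGI_MT_RANGE*
	 * flag left in *flags tells the page_get routines, through
	 * mtype_func(), how far down the mnr_next chain they may fall
	 * back.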
14585084Sjohnlev */ 14595084Sjohnlev if (restricted_kmemalloc && VN_ISKAS(vp) && 14605084Sjohnlev (caddr_t)(vaddr) >= kernelheap && 14615084Sjohnlev (caddr_t)(vaddr) < ekernelheap) { 14625084Sjohnlev ASSERT(physmax4g); 14635084Sjohnlev mtype = mtype4g; 14645084Sjohnlev if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz), 14655084Sjohnlev btop(pgsz), *flags)) { 14665084Sjohnlev *flags |= PGI_MT_RANGE16M; 14675084Sjohnlev } else { 14685084Sjohnlev VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 14695084Sjohnlev VM_STAT_COND_ADD((*flags & PG_PANIC), 14705084Sjohnlev vmm_vmstats.pgpanicalloc); 14715084Sjohnlev *flags |= PGI_MT_RANGE0; 14725084Sjohnlev } 14735084Sjohnlev return (mtype); 14745084Sjohnlev } 14755084Sjohnlev #endif /* __i386 */ 14765084Sjohnlev 14775084Sjohnlev if (RESTRICT4G_ALLOC) { 14785084Sjohnlev VM_STAT_ADD(vmm_vmstats.restrict4gcnt); 14795084Sjohnlev /* here only for > 4g systems */ 14805084Sjohnlev *flags |= PGI_MT_RANGE4G; 14815084Sjohnlev } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) { 14825084Sjohnlev *flags |= PGI_MT_RANGE16M; 14835084Sjohnlev } else { 14845084Sjohnlev VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 14855084Sjohnlev VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc); 14865084Sjohnlev *flags |= PGI_MT_RANGE0; 14875084Sjohnlev } 14885084Sjohnlev #endif /* !__xpv */ 14895084Sjohnlev return (mtype); 14905084Sjohnlev } 14915084Sjohnlev 14925084Sjohnlev 14935084Sjohnlev /* mtype init for page_get_replacement_page */ 14945084Sjohnlev /*ARGSUSED*/ 14955084Sjohnlev int 14965084Sjohnlev mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt) 14975084Sjohnlev { 1498*12004Sjiang.liu@intel.com int mtype = mtypetop; 1499*12004Sjiang.liu@intel.com #if !defined(__xpv) 15005084Sjohnlev if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) { 15015084Sjohnlev *flags |= PGI_MT_RANGE16M; 15025084Sjohnlev } else { 15035084Sjohnlev VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 15045084Sjohnlev *flags |= PGI_MT_RANGE0; 15055084Sjohnlev } 15065084Sjohnlev #endif 15075084Sjohnlev return (mtype); 15085084Sjohnlev } 15095084Sjohnlev 15100Sstevel@tonic-gate /* 15110Sstevel@tonic-gate * Determine if the mnode range specified in mtype contains memory belonging 15120Sstevel@tonic-gate * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains 1513*12004Sjiang.liu@intel.com * the range from high pfn to 0, 16m or 4g. 15140Sstevel@tonic-gate * 15150Sstevel@tonic-gate * Return first mnode range type index found otherwise return -1 if none found. 
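 * A typical caller walks all mtypes of an mnode the way mnode_pgcnt()
 * below does: start from mtypetop with PGI_MT_RANGE0 and keep feeding
 * the previous result back in with PGI_MT_NEXT until -1 is returned.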
15160Sstevel@tonic-gate */ 15170Sstevel@tonic-gate int 15180Sstevel@tonic-gate mtype_func(int mnode, int mtype, uint_t flags) 15190Sstevel@tonic-gate { 15200Sstevel@tonic-gate if (flags & PGI_MT_RANGE) { 1521*12004Sjiang.liu@intel.com int mnr_lim = MRI_0; 1522*12004Sjiang.liu@intel.com 1523*12004Sjiang.liu@intel.com if (flags & PGI_MT_NEXT) { 1524*12004Sjiang.liu@intel.com mtype = mnoderanges[mtype].mnr_next; 1525*12004Sjiang.liu@intel.com } 15265084Sjohnlev if (flags & PGI_MT_RANGE4G) 1527*12004Sjiang.liu@intel.com mnr_lim = MRI_4G; /* exclude 0-4g range */ 15281385Skchow else if (flags & PGI_MT_RANGE16M) 1529*12004Sjiang.liu@intel.com mnr_lim = MRI_16M; /* exclude 0-16m range */ 1530*12004Sjiang.liu@intel.com while (mtype != -1 && 1531*12004Sjiang.liu@intel.com mnoderanges[mtype].mnr_memrange <= mnr_lim) { 15320Sstevel@tonic-gate if (mnoderanges[mtype].mnr_mnode == mnode) 15330Sstevel@tonic-gate return (mtype); 1534*12004Sjiang.liu@intel.com mtype = mnoderanges[mtype].mnr_next; 15350Sstevel@tonic-gate } 15365084Sjohnlev } else if (mnoderanges[mtype].mnr_mnode == mnode) { 15375084Sjohnlev return (mtype); 15380Sstevel@tonic-gate } 15390Sstevel@tonic-gate return (-1); 15400Sstevel@tonic-gate } 15410Sstevel@tonic-gate 15420Sstevel@tonic-gate /* 15431373Skchow * Update the page list max counts with the pfn range specified by the 1544*12004Sjiang.liu@intel.com * input parameters. 15451373Skchow */ 15461373Skchow void 15471373Skchow mtype_modify_max(pfn_t startpfn, long cnt) 15481373Skchow { 1549*12004Sjiang.liu@intel.com int mtype; 1550*12004Sjiang.liu@intel.com pgcnt_t inc; 1551*12004Sjiang.liu@intel.com spgcnt_t scnt = (spgcnt_t)(cnt); 1552*12004Sjiang.liu@intel.com pgcnt_t acnt = ABS(scnt); 1553*12004Sjiang.liu@intel.com pfn_t endpfn = startpfn + acnt; 1554*12004Sjiang.liu@intel.com pfn_t pfn, lo; 15551373Skchow 15565084Sjohnlev if (!physmax4g) 15575084Sjohnlev return; 15585084Sjohnlev 1559*12004Sjiang.liu@intel.com mtype = mtypetop; 1560*12004Sjiang.liu@intel.com for (pfn = endpfn; pfn > startpfn; ) { 1561*12004Sjiang.liu@intel.com ASSERT(mtype != -1); 1562*12004Sjiang.liu@intel.com lo = mnoderanges[mtype].mnr_pfnlo; 1563*12004Sjiang.liu@intel.com if (pfn > lo) { 1564*12004Sjiang.liu@intel.com if (startpfn >= lo) { 1565*12004Sjiang.liu@intel.com inc = pfn - startpfn; 15661373Skchow } else { 1567*12004Sjiang.liu@intel.com inc = pfn - lo; 15681373Skchow } 1569*12004Sjiang.liu@intel.com if (mnoderanges[mtype].mnr_memrange != MRI_4G) { 1570*12004Sjiang.liu@intel.com if (scnt > 0) 1571*12004Sjiang.liu@intel.com maxmem4g += inc; 1572*12004Sjiang.liu@intel.com else 1573*12004Sjiang.liu@intel.com maxmem4g -= inc; 1574*12004Sjiang.liu@intel.com } 1575*12004Sjiang.liu@intel.com pfn -= inc; 15761373Skchow } 1577*12004Sjiang.liu@intel.com mtype = mnoderanges[mtype].mnr_next; 15781373Skchow } 15791373Skchow } 15801373Skchow 15815084Sjohnlev int 15825084Sjohnlev mtype_2_mrange(int mtype) 15835084Sjohnlev { 15845084Sjohnlev return (mnoderanges[mtype].mnr_memrange); 15855084Sjohnlev } 15865084Sjohnlev 15875084Sjohnlev void 15885084Sjohnlev mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi) 15895084Sjohnlev { 1590*12004Sjiang.liu@intel.com _NOTE(ARGUNUSED(mnode)); 15915084Sjohnlev ASSERT(mnoderanges[mtype].mnr_mnode == mnode); 15925084Sjohnlev *pfnlo = mnoderanges[mtype].mnr_pfnlo; 15935084Sjohnlev *pfnhi = mnoderanges[mtype].mnr_pfnhi; 15945084Sjohnlev } 15955084Sjohnlev 15965084Sjohnlev size_t 15975084Sjohnlev plcnt_sz(size_t ctrs_sz) 15985084Sjohnlev { 15995084Sjohnlev #ifdef DEBUG 
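	/*
	 * The DEBUG-only mnr_mts counters record, per mtype, how many
	 * pages of each size and color are on the page lists; account
	 * for their space here so plcnt_init() can carve it out of the
	 * same counter area later.
	 */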
16005084Sjohnlev int szc, colors; 16015084Sjohnlev 16025084Sjohnlev ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes; 16035084Sjohnlev for (szc = 0; szc < mmu_page_sizes; szc++) { 16045084Sjohnlev colors = page_get_pagecolors(szc); 16055084Sjohnlev ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors; 16065084Sjohnlev } 16075084Sjohnlev #endif 16085084Sjohnlev return (ctrs_sz); 16095084Sjohnlev } 16105084Sjohnlev 16115084Sjohnlev caddr_t 16125084Sjohnlev plcnt_init(caddr_t addr) 16135084Sjohnlev { 16145084Sjohnlev #ifdef DEBUG 16155084Sjohnlev int mt, szc, colors; 16165084Sjohnlev 16175084Sjohnlev for (mt = 0; mt < mnoderangecnt; mt++) { 16185084Sjohnlev mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr; 16195084Sjohnlev addr += (sizeof (struct mnr_mts) * mmu_page_sizes); 16205084Sjohnlev for (szc = 0; szc < mmu_page_sizes; szc++) { 16215084Sjohnlev colors = page_get_pagecolors(szc); 16225084Sjohnlev mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors; 16235084Sjohnlev mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt = 16245084Sjohnlev (pgcnt_t *)addr; 16255084Sjohnlev addr += (sizeof (pgcnt_t) * colors); 16265084Sjohnlev } 16275084Sjohnlev } 16285084Sjohnlev #endif 16295084Sjohnlev return (addr); 16305084Sjohnlev } 16315084Sjohnlev 16325084Sjohnlev void 16335084Sjohnlev plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) 16345084Sjohnlev { 1635*12004Sjiang.liu@intel.com _NOTE(ARGUNUSED(pp)); 16365084Sjohnlev #ifdef DEBUG 16375084Sjohnlev int bin = PP_2_BIN(pp); 16385084Sjohnlev 16395084Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt); 16405084Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin], 16415084Sjohnlev cnt); 16425084Sjohnlev #endif 16435084Sjohnlev ASSERT(mtype == PP_2_MTYPE(pp)); 1644*12004Sjiang.liu@intel.com if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G) 16455084Sjohnlev atomic_add_long(&freemem4g, cnt); 16465084Sjohnlev if (flags & PG_CACHE_LIST) 16475084Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt); 16485084Sjohnlev else 16495466Skchow atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt); 16505466Skchow atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt); 16515084Sjohnlev } 16525084Sjohnlev 16531373Skchow /* 1654414Skchow * Returns the free page count for mnode 1655414Skchow */ 1656414Skchow int 1657414Skchow mnode_pgcnt(int mnode) 1658414Skchow { 1659*12004Sjiang.liu@intel.com int mtype = mtypetop; 1660414Skchow int flags = PGI_MT_RANGE0; 1661414Skchow pgcnt_t pgcnt = 0; 1662414Skchow 1663414Skchow mtype = mtype_func(mnode, mtype, flags); 1664414Skchow 1665414Skchow while (mtype != -1) { 16661385Skchow pgcnt += MTYPE_FREEMEM(mtype); 1667414Skchow mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1668414Skchow } 1669414Skchow return (pgcnt); 1670414Skchow } 1671414Skchow 1672414Skchow /* 16730Sstevel@tonic-gate * Initialize page coloring variables based on the l2 cache parameters. 16740Sstevel@tonic-gate * Calculate and return memory needed for page coloring data structures. 
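 * The number of hardware colors is l2_sz / (l2_assoc * MMU_PAGESIZE):
 * for example, a (hypothetical) 2M, 8-way L2 with 4K pages gives
 * 2M / (8 * 4K) = 64 colors, i.e. pages 256K apart share an L2 index.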
16750Sstevel@tonic-gate */ 16760Sstevel@tonic-gate size_t 16770Sstevel@tonic-gate page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) 16780Sstevel@tonic-gate { 1679*12004Sjiang.liu@intel.com _NOTE(ARGUNUSED(l2_linesz)); 16800Sstevel@tonic-gate size_t colorsz = 0; 16810Sstevel@tonic-gate int i; 16820Sstevel@tonic-gate int colors; 16830Sstevel@tonic-gate 16845084Sjohnlev #if defined(__xpv) 16855084Sjohnlev /* 16865084Sjohnlev * Hypervisor domains currently don't have any concept of NUMA. 16875084Sjohnlev * Hence we'll act like there is only 1 memrange. 16885084Sjohnlev */ 16895084Sjohnlev i = memrange_num(1); 16905084Sjohnlev #else /* !__xpv */ 16910Sstevel@tonic-gate /* 16920Sstevel@tonic-gate * Reduce the memory ranges lists if we don't have large amounts 16930Sstevel@tonic-gate * of memory. This avoids searching known empty free lists. 1694*12004Sjiang.liu@intel.com * To support memory DR operations, we need to keep memory ranges 1695*12004Sjiang.liu@intel.com * for possible memory hot-add operations. 16960Sstevel@tonic-gate */ 1697*12004Sjiang.liu@intel.com if (plat_dr_physmax > physmax) 1698*12004Sjiang.liu@intel.com i = memrange_num(plat_dr_physmax); 1699*12004Sjiang.liu@intel.com else 1700*12004Sjiang.liu@intel.com i = memrange_num(physmax); 17010Sstevel@tonic-gate #if defined(__i386) 1702*12004Sjiang.liu@intel.com if (i > MRI_4G) 17030Sstevel@tonic-gate restricted_kmemalloc = 0; 17040Sstevel@tonic-gate #endif 17050Sstevel@tonic-gate /* physmax greater than 4g */ 1706*12004Sjiang.liu@intel.com if (i == MRI_4G) 17070Sstevel@tonic-gate physmax4g = 1; 17085084Sjohnlev #endif /* !__xpv */ 17095084Sjohnlev memranges += i; 17105084Sjohnlev nranges -= i; 17110Sstevel@tonic-gate 17125349Skchow ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES); 17135349Skchow 17140Sstevel@tonic-gate ASSERT(ISP2(l2_linesz)); 17150Sstevel@tonic-gate ASSERT(l2_sz > MMU_PAGESIZE); 17160Sstevel@tonic-gate 17170Sstevel@tonic-gate /* l2_assoc is 0 for fully associative l2 cache */ 17180Sstevel@tonic-gate if (l2_assoc) 17190Sstevel@tonic-gate l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE)); 17200Sstevel@tonic-gate else 17210Sstevel@tonic-gate l2_colors = 1; 17220Sstevel@tonic-gate 17237069Svd224797 ASSERT(ISP2(l2_colors)); 17247069Svd224797 17250Sstevel@tonic-gate /* for scalability, configure at least PAGE_COLORS_MIN color bins */ 17260Sstevel@tonic-gate page_colors = MAX(l2_colors, PAGE_COLORS_MIN); 17270Sstevel@tonic-gate 17280Sstevel@tonic-gate /* 17290Sstevel@tonic-gate * cpu_page_colors is non-zero when a page color may be spread across 17300Sstevel@tonic-gate * multiple bins. 
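 * That is the case when l2_colors is smaller than PAGE_COLORS_MIN:
 * with, say, l2_colors of 8, page_colors is raised to PAGE_COLORS_MIN
 * while cpu_page_colors stays 8, so each hardware color is spread over
 * page_colors / 8 freelist bins and the page_get routines must search
 * all of them.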
17310Sstevel@tonic-gate */ 17320Sstevel@tonic-gate if (l2_colors < page_colors) 17330Sstevel@tonic-gate cpu_page_colors = l2_colors; 17340Sstevel@tonic-gate 17350Sstevel@tonic-gate ASSERT(ISP2(page_colors)); 17360Sstevel@tonic-gate 17370Sstevel@tonic-gate page_colors_mask = page_colors - 1; 17380Sstevel@tonic-gate 17390Sstevel@tonic-gate ASSERT(ISP2(CPUSETSIZE())); 17400Sstevel@tonic-gate page_coloring_shift = lowbit(CPUSETSIZE()); 17410Sstevel@tonic-gate 17422961Sdp78419 /* initialize number of colors per page size */ 17432961Sdp78419 for (i = 0; i <= mmu.max_page_level; i++) { 17442961Sdp78419 hw_page_array[i].hp_size = LEVEL_SIZE(i); 17452961Sdp78419 hw_page_array[i].hp_shift = LEVEL_SHIFT(i); 17462961Sdp78419 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0); 17472961Sdp78419 hw_page_array[i].hp_colors = (page_colors_mask >> 17482961Sdp78419 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift)) 17492961Sdp78419 + 1; 17503717Sdp78419 colorequivszc[i] = 0; 17512961Sdp78419 } 17522961Sdp78419 17532961Sdp78419 /* 17542961Sdp78419 * The value of cpu_page_colors determines if additional color bins 17552961Sdp78419 * need to be checked for a particular color in the page_get routines. 17562961Sdp78419 */ 17572961Sdp78419 if (cpu_page_colors != 0) { 17582961Sdp78419 17592961Sdp78419 int a = lowbit(page_colors) - lowbit(cpu_page_colors); 17602961Sdp78419 ASSERT(a > 0); 17612961Sdp78419 ASSERT(a < 16); 17622961Sdp78419 17632961Sdp78419 for (i = 0; i <= mmu.max_page_level; i++) { 17642961Sdp78419 if ((colors = hw_page_array[i].hp_colors) <= 1) { 17652961Sdp78419 colorequivszc[i] = 0; 17662961Sdp78419 continue; 17672961Sdp78419 } 17682961Sdp78419 while ((colors >> a) == 0) 17692961Sdp78419 a--; 17702961Sdp78419 ASSERT(a >= 0); 17712961Sdp78419 17722961Sdp78419 /* higher 4 bits encode color equiv mask */ 17732961Sdp78419 colorequivszc[i] = (a << 4); 17742961Sdp78419 } 17752961Sdp78419 } 17762961Sdp78419 17775084Sjohnlev /* factor in colorequiv to check additional 'equivalent' bins. */ 17785084Sjohnlev if (colorequiv > 1) { 17795084Sjohnlev 17805084Sjohnlev int a = lowbit(colorequiv) - 1; 17815084Sjohnlev if (a > 15) 17825084Sjohnlev a = 15; 17835084Sjohnlev 17845084Sjohnlev for (i = 0; i <= mmu.max_page_level; i++) { 17855084Sjohnlev if ((colors = hw_page_array[i].hp_colors) <= 1) { 17865084Sjohnlev continue; 17875084Sjohnlev } 17885084Sjohnlev while ((colors >> a) == 0) 17895084Sjohnlev a--; 17905084Sjohnlev if ((a << 4) > colorequivszc[i]) { 17915084Sjohnlev colorequivszc[i] = (a << 4); 17925084Sjohnlev } 17935084Sjohnlev } 17945084Sjohnlev } 17955084Sjohnlev 17960Sstevel@tonic-gate /* size for mnoderanges */ 17972961Sdp78419 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++) 17982961Sdp78419 mnoderangecnt += mnode_range_cnt(i); 1799*12004Sjiang.liu@intel.com if (plat_dr_support_memory()) { 1800*12004Sjiang.liu@intel.com /* 1801*12004Sjiang.liu@intel.com * Reserve enough space for memory DR operations. 1802*12004Sjiang.liu@intel.com * Two extra mnoderanges for possible fragmentation, 1803*12004Sjiang.liu@intel.com * one for the 2G boundary and the other for the 4G boundary. 1804*12004Sjiang.liu@intel.com * We don't expect a memory board crossing the 16M boundary 1805*12004Sjiang.liu@intel.com * for memory hot-add operations on x86 platforms. 
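		 * Each node that could be hot-added later also needs its
		 * own entry, which is presumably what the
		 * max_mem_nodes - lgrp_plat_node_cnt term covers: e.g. a
		 * platform with four possible nodes, two present at
		 * boot, reserves 2 + 4 - 2 = 4 extra mnoderanges.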
1806*12004Sjiang.liu@intel.com */ 1807*12004Sjiang.liu@intel.com mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt; 1808*12004Sjiang.liu@intel.com } 18090Sstevel@tonic-gate colorsz = mnoderangecnt * sizeof (mnoderange_t); 18100Sstevel@tonic-gate 18110Sstevel@tonic-gate /* size for fpc_mutex and cpc_mutex */ 18120Sstevel@tonic-gate colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX); 18130Sstevel@tonic-gate 18140Sstevel@tonic-gate /* size of page_freelists */ 18150Sstevel@tonic-gate colorsz += mnoderangecnt * sizeof (page_t ***); 18160Sstevel@tonic-gate colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **); 18170Sstevel@tonic-gate 18180Sstevel@tonic-gate for (i = 0; i < mmu_page_sizes; i++) { 18190Sstevel@tonic-gate colors = page_get_pagecolors(i); 18200Sstevel@tonic-gate colorsz += mnoderangecnt * colors * sizeof (page_t *); 18210Sstevel@tonic-gate } 18220Sstevel@tonic-gate 18230Sstevel@tonic-gate /* size of page_cachelists */ 18240Sstevel@tonic-gate colorsz += mnoderangecnt * sizeof (page_t **); 18250Sstevel@tonic-gate colorsz += mnoderangecnt * page_colors * sizeof (page_t *); 18260Sstevel@tonic-gate 18270Sstevel@tonic-gate return (colorsz); 18280Sstevel@tonic-gate } 18290Sstevel@tonic-gate 18300Sstevel@tonic-gate /* 18310Sstevel@tonic-gate * Called once at startup to configure page_coloring data structures and 18320Sstevel@tonic-gate * does the 1st page_free()/page_freelist_add(). 18330Sstevel@tonic-gate */ 18340Sstevel@tonic-gate void 18350Sstevel@tonic-gate page_coloring_setup(caddr_t pcmemaddr) 18360Sstevel@tonic-gate { 18370Sstevel@tonic-gate int i; 18380Sstevel@tonic-gate int j; 18390Sstevel@tonic-gate int k; 18400Sstevel@tonic-gate caddr_t addr; 18410Sstevel@tonic-gate int colors; 18420Sstevel@tonic-gate 18430Sstevel@tonic-gate /* 18440Sstevel@tonic-gate * do page coloring setup 18450Sstevel@tonic-gate */ 18460Sstevel@tonic-gate addr = pcmemaddr; 18470Sstevel@tonic-gate 18480Sstevel@tonic-gate mnoderanges = (mnoderange_t *)addr; 18490Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (mnoderange_t)); 18500Sstevel@tonic-gate 18510Sstevel@tonic-gate mnode_range_setup(mnoderanges); 18520Sstevel@tonic-gate 18530Sstevel@tonic-gate if (physmax4g) 18540Sstevel@tonic-gate mtype4g = pfn_2_mtype(0xfffff); 18550Sstevel@tonic-gate 18560Sstevel@tonic-gate for (k = 0; k < NPC_MUTEX; k++) { 18570Sstevel@tonic-gate fpc_mutex[k] = (kmutex_t *)addr; 18580Sstevel@tonic-gate addr += (max_mem_nodes * sizeof (kmutex_t)); 18590Sstevel@tonic-gate } 18600Sstevel@tonic-gate for (k = 0; k < NPC_MUTEX; k++) { 18610Sstevel@tonic-gate cpc_mutex[k] = (kmutex_t *)addr; 18620Sstevel@tonic-gate addr += (max_mem_nodes * sizeof (kmutex_t)); 18630Sstevel@tonic-gate } 18640Sstevel@tonic-gate page_freelists = (page_t ****)addr; 18650Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (page_t ***)); 18660Sstevel@tonic-gate 18670Sstevel@tonic-gate page_cachelists = (page_t ***)addr; 18680Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (page_t **)); 18690Sstevel@tonic-gate 18700Sstevel@tonic-gate for (i = 0; i < mnoderangecnt; i++) { 18710Sstevel@tonic-gate page_freelists[i] = (page_t ***)addr; 18720Sstevel@tonic-gate addr += (mmu_page_sizes * sizeof (page_t **)); 18730Sstevel@tonic-gate 18740Sstevel@tonic-gate for (j = 0; j < mmu_page_sizes; j++) { 18750Sstevel@tonic-gate colors = page_get_pagecolors(j); 18760Sstevel@tonic-gate page_freelists[i][j] = (page_t **)addr; 18770Sstevel@tonic-gate addr += (colors * sizeof (page_t *)); 18780Sstevel@tonic-gate } 18790Sstevel@tonic-gate 
page_cachelists[i] = (page_t **)addr; 18800Sstevel@tonic-gate addr += (page_colors * sizeof (page_t *)); 18810Sstevel@tonic-gate } 18820Sstevel@tonic-gate } 18830Sstevel@tonic-gate 18845084Sjohnlev #if defined(__xpv) 18855084Sjohnlev /* 18865084Sjohnlev * Give back 10% of the io_pool pages to the free list. 18875084Sjohnlev * Don't shrink the pool below some absolute minimum. 18885084Sjohnlev */ 18895084Sjohnlev static void 18905084Sjohnlev page_io_pool_shrink() 18915084Sjohnlev { 18925084Sjohnlev int retcnt; 18935084Sjohnlev page_t *pp, *pp_first, *pp_last, **curpool; 18945084Sjohnlev mfn_t mfn; 18955084Sjohnlev int bothpools = 0; 18965084Sjohnlev 18975084Sjohnlev mutex_enter(&io_pool_lock); 18985084Sjohnlev io_pool_shrink_attempts++; /* should be a kstat? */ 18995084Sjohnlev retcnt = io_pool_cnt / 10; 19005084Sjohnlev if (io_pool_cnt - retcnt < io_pool_cnt_min) 19015084Sjohnlev retcnt = io_pool_cnt - io_pool_cnt_min; 19025084Sjohnlev if (retcnt <= 0) 19035084Sjohnlev goto done; 19045084Sjohnlev io_pool_shrinks++; /* should be a kstat? */ 19055084Sjohnlev curpool = &io_pool_4g; 19065084Sjohnlev domore: 19075084Sjohnlev /* 19085084Sjohnlev * Loop through taking pages from the end of the list 19095084Sjohnlev * (highest mfns) till amount to return reached. 19105084Sjohnlev */ 19115084Sjohnlev for (pp = *curpool; pp && retcnt > 0; ) { 19125084Sjohnlev pp_first = pp_last = pp->p_prev; 19135084Sjohnlev if (pp_first == *curpool) 19145084Sjohnlev break; 19155084Sjohnlev retcnt--; 19165084Sjohnlev io_pool_cnt--; 19175084Sjohnlev page_io_pool_sub(curpool, pp_first, pp_last); 19185084Sjohnlev if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn) 19195084Sjohnlev start_mfn = mfn; 19205084Sjohnlev page_free(pp_first, 1); 19215084Sjohnlev pp = *curpool; 19225084Sjohnlev } 19235084Sjohnlev if (retcnt != 0 && !bothpools) { 19245084Sjohnlev /* 19255084Sjohnlev * If not enough found in less constrained pool try the 19265084Sjohnlev * more constrained one. 19275084Sjohnlev */ 19285084Sjohnlev curpool = &io_pool_16m; 19295084Sjohnlev bothpools = 1; 19305084Sjohnlev goto domore; 19315084Sjohnlev } 19325084Sjohnlev done: 19335084Sjohnlev mutex_exit(&io_pool_lock); 19345084Sjohnlev } 19355084Sjohnlev 19365084Sjohnlev #endif /* __xpv */ 19375084Sjohnlev 19385084Sjohnlev uint_t 19395084Sjohnlev page_create_update_flags_x86(uint_t flags) 19405084Sjohnlev { 19415084Sjohnlev #if defined(__xpv) 19425084Sjohnlev /* 19435084Sjohnlev * Check this is an urgent allocation and free pages are depleted. 19445084Sjohnlev */ 19455084Sjohnlev if (!(flags & PG_WAIT) && freemem < desfree) 19465084Sjohnlev page_io_pool_shrink(); 19475084Sjohnlev #else /* !__xpv */ 19485084Sjohnlev /* 19495084Sjohnlev * page_create_get_something may call this because 4g memory may be 19505084Sjohnlev * depleted. Set flags to allow for relocation of base page below 19515084Sjohnlev * 4g if necessary. 
19525084Sjohnlev */ 19535084Sjohnlev if (physmax4g) 19545084Sjohnlev flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); 19555084Sjohnlev #endif /* __xpv */ 19565084Sjohnlev return (flags); 19575084Sjohnlev } 19585084Sjohnlev 19590Sstevel@tonic-gate /*ARGSUSED*/ 19600Sstevel@tonic-gate int 19610Sstevel@tonic-gate bp_color(struct buf *bp) 19620Sstevel@tonic-gate { 19630Sstevel@tonic-gate return (0); 19640Sstevel@tonic-gate } 19650Sstevel@tonic-gate 19665084Sjohnlev #if defined(__xpv) 19675084Sjohnlev 19685084Sjohnlev /* 19695084Sjohnlev * Take pages out of an io_pool 19705084Sjohnlev */ 19715084Sjohnlev static void 19725084Sjohnlev page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last) 19735084Sjohnlev { 19745084Sjohnlev if (*poolp == pp_first) { 19755084Sjohnlev *poolp = pp_last->p_next; 19765084Sjohnlev if (*poolp == pp_first) 19775084Sjohnlev *poolp = NULL; 19785084Sjohnlev } 19795084Sjohnlev pp_first->p_prev->p_next = pp_last->p_next; 19805084Sjohnlev pp_last->p_next->p_prev = pp_first->p_prev; 19815084Sjohnlev pp_first->p_prev = pp_last; 19825084Sjohnlev pp_last->p_next = pp_first; 19835084Sjohnlev } 19845084Sjohnlev 19855084Sjohnlev /* 19865084Sjohnlev * Put a page on the io_pool list. The list is ordered by increasing MFN. 19875084Sjohnlev */ 19885084Sjohnlev static void 19895084Sjohnlev page_io_pool_add(page_t **poolp, page_t *pp) 19905084Sjohnlev { 19915084Sjohnlev page_t *look; 19925084Sjohnlev mfn_t mfn = mfn_list[pp->p_pagenum]; 19935084Sjohnlev 19945084Sjohnlev if (*poolp == NULL) { 19955084Sjohnlev *poolp = pp; 19965084Sjohnlev pp->p_next = pp; 19975084Sjohnlev pp->p_prev = pp; 19985084Sjohnlev return; 19995084Sjohnlev } 20005084Sjohnlev 20015084Sjohnlev /* 20025084Sjohnlev * Since we try to take pages from the high end of the pool 20035084Sjohnlev * chances are good that the pages to be put on the list will 20045084Sjohnlev * go at or near the end of the list. so start at the end and 20055084Sjohnlev * work backwards. 20065084Sjohnlev */ 20075084Sjohnlev look = (*poolp)->p_prev; 20085084Sjohnlev while (mfn < mfn_list[look->p_pagenum]) { 20095084Sjohnlev look = look->p_prev; 20105084Sjohnlev if (look == (*poolp)->p_prev) 20115084Sjohnlev break; /* backed all the way to front of list */ 20125084Sjohnlev } 20135084Sjohnlev 20145084Sjohnlev /* insert after look */ 20155084Sjohnlev pp->p_prev = look; 20165084Sjohnlev pp->p_next = look->p_next; 20175084Sjohnlev pp->p_next->p_prev = pp; 20185084Sjohnlev look->p_next = pp; 20195084Sjohnlev if (mfn < mfn_list[(*poolp)->p_pagenum]) { 20205084Sjohnlev /* 20215084Sjohnlev * we inserted a new first list element 20225084Sjohnlev * adjust pool pointer to newly inserted element 20235084Sjohnlev */ 20245084Sjohnlev *poolp = pp; 20255084Sjohnlev } 20265084Sjohnlev } 20275084Sjohnlev 20285084Sjohnlev /* 20295084Sjohnlev * Add a page to the io_pool. Setting the force flag will force the page 20305084Sjohnlev * into the io_pool no matter what. 
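 * Pages whose MFN is below 16M always go on io_pool_16m. Other pages
 * go on io_pool_4g unless that pool is already full (and force is not
 * set), in which case whichever of the newcomer and the current
 * highest-MFN pool page has the larger MFN is handed back to the free
 * lists instead.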
20315084Sjohnlev */ 20325084Sjohnlev static void 20335084Sjohnlev add_page_to_pool(page_t *pp, int force) 20345084Sjohnlev { 20355084Sjohnlev page_t *highest; 20365084Sjohnlev page_t *freep = NULL; 20375084Sjohnlev 20385084Sjohnlev mutex_enter(&io_pool_lock); 20395084Sjohnlev /* 20405084Sjohnlev * Always keep the scarce low memory pages 20415084Sjohnlev */ 20425084Sjohnlev if (mfn_list[pp->p_pagenum] < PFN_16MEG) { 20435084Sjohnlev ++io_pool_cnt; 20445084Sjohnlev page_io_pool_add(&io_pool_16m, pp); 20455084Sjohnlev goto done; 20465084Sjohnlev } 20476159Ssmaybe if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) { 20485084Sjohnlev ++io_pool_cnt; 20495084Sjohnlev page_io_pool_add(&io_pool_4g, pp); 20505084Sjohnlev } else { 20515084Sjohnlev highest = io_pool_4g->p_prev; 20525084Sjohnlev if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) { 20535084Sjohnlev page_io_pool_sub(&io_pool_4g, highest, highest); 20545084Sjohnlev page_io_pool_add(&io_pool_4g, pp); 20555084Sjohnlev freep = highest; 20565084Sjohnlev } else { 20575084Sjohnlev freep = pp; 20585084Sjohnlev } 20595084Sjohnlev } 20605084Sjohnlev done: 20615084Sjohnlev mutex_exit(&io_pool_lock); 20625084Sjohnlev if (freep) 20635084Sjohnlev page_free(freep, 1); 20645084Sjohnlev } 20655084Sjohnlev 20665084Sjohnlev 20675084Sjohnlev int contig_pfn_cnt; /* no of pfns in the contig pfn list */ 20685084Sjohnlev int contig_pfn_max; /* capacity of the contig pfn list */ 20695084Sjohnlev int next_alloc_pfn; /* next position in list to start a contig search */ 20705084Sjohnlev int contig_pfnlist_updates; /* pfn list update count */ 20715084Sjohnlev int contig_pfnlist_builds; /* how many times have we (re)built list */ 20725084Sjohnlev int contig_pfnlist_buildfailed; /* how many times has list build failed */ 20735084Sjohnlev int create_contig_pending; /* nonzero means taskq creating contig list */ 20745084Sjohnlev pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */ 20755084Sjohnlev 20765084Sjohnlev /* 20775084Sjohnlev * Function to use in sorting a list of pfns by their underlying mfns. 20785084Sjohnlev */ 20795084Sjohnlev static int 20805084Sjohnlev mfn_compare(const void *pfnp1, const void *pfnp2) 20815084Sjohnlev { 20825084Sjohnlev mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1]; 20835084Sjohnlev mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2]; 20845084Sjohnlev 20855084Sjohnlev if (mfn1 > mfn2) 20865084Sjohnlev return (1); 20875084Sjohnlev if (mfn1 < mfn2) 20885084Sjohnlev return (-1); 20895084Sjohnlev return (0); 20905084Sjohnlev } 20915084Sjohnlev 20925084Sjohnlev /* 20935084Sjohnlev * Compact the contig_pfn_list by tossing all the non-contiguous 20945084Sjohnlev * elements from the list. 20955084Sjohnlev */ 20965084Sjohnlev static void 20975084Sjohnlev compact_contig_pfn_list(void) 20985084Sjohnlev { 20995084Sjohnlev pfn_t pfn, lapfn, prev_lapfn; 21005084Sjohnlev mfn_t mfn; 21015084Sjohnlev int i, newcnt = 0; 21025084Sjohnlev 21035084Sjohnlev prev_lapfn = 0; 21045084Sjohnlev for (i = 0; i < contig_pfn_cnt - 1; i++) { 21055084Sjohnlev pfn = contig_pfn_list[i]; 21065084Sjohnlev lapfn = contig_pfn_list[i + 1]; 21075084Sjohnlev mfn = mfn_list[pfn]; 21085084Sjohnlev /* 21095084Sjohnlev * See if next pfn is for a contig mfn 21105084Sjohnlev */ 21115084Sjohnlev if (mfn_list[lapfn] != mfn + 1) 21125084Sjohnlev continue; 21135084Sjohnlev /* 21145084Sjohnlev * pfn and lookahead are both put in list 21155084Sjohnlev * unless pfn is the previous lookahead. 
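		 * For instance, pfns backed by mfns 10, 11, 12 and 50
		 * compact to the first three (the middle pfn is added
		 * only once) and the pfn backed by mfn 50 is dropped.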
21165084Sjohnlev */ 21175084Sjohnlev if (pfn != prev_lapfn) 21185084Sjohnlev contig_pfn_list[newcnt++] = pfn; 21195084Sjohnlev contig_pfn_list[newcnt++] = lapfn; 21205084Sjohnlev prev_lapfn = lapfn; 21215084Sjohnlev } 21225084Sjohnlev for (i = newcnt; i < contig_pfn_cnt; i++) 21235084Sjohnlev contig_pfn_list[i] = 0; 21245084Sjohnlev contig_pfn_cnt = newcnt; 21255084Sjohnlev } 21265084Sjohnlev 21275084Sjohnlev /*ARGSUSED*/ 21285084Sjohnlev static void 21295084Sjohnlev call_create_contiglist(void *arg) 21305084Sjohnlev { 21315084Sjohnlev (void) create_contig_pfnlist(PG_WAIT); 21325084Sjohnlev } 21335084Sjohnlev 21345084Sjohnlev /* 21355084Sjohnlev * Create list of freelist pfns that have underlying 21365084Sjohnlev * contiguous mfns. The list is kept in ascending mfn order. 21375084Sjohnlev * returns 1 if list created else 0. 21385084Sjohnlev */ 21395084Sjohnlev static int 21405084Sjohnlev create_contig_pfnlist(uint_t flags) 21415084Sjohnlev { 21425084Sjohnlev pfn_t pfn; 21435084Sjohnlev page_t *pp; 21445529Ssmaybe int ret = 1; 21455529Ssmaybe 21465529Ssmaybe mutex_enter(&contig_list_lock); 21475084Sjohnlev if (contig_pfn_list != NULL) 21485529Ssmaybe goto out; 21495084Sjohnlev contig_pfn_max = freemem + (freemem / 10); 21505084Sjohnlev contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t), 21515084Sjohnlev (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP); 21525084Sjohnlev if (contig_pfn_list == NULL) { 21535084Sjohnlev /* 21545084Sjohnlev * If we could not create the contig list (because 21555084Sjohnlev * we could not sleep for memory). Dispatch a taskq that can 21565084Sjohnlev * sleep to get the memory. 21575084Sjohnlev */ 21585084Sjohnlev if (!create_contig_pending) { 21595084Sjohnlev if (taskq_dispatch(system_taskq, call_create_contiglist, 21605084Sjohnlev NULL, TQ_NOSLEEP) != NULL) 21615084Sjohnlev create_contig_pending = 1; 21625084Sjohnlev } 21635084Sjohnlev contig_pfnlist_buildfailed++; /* count list build failures */ 21645529Ssmaybe ret = 0; 21655529Ssmaybe goto out; 21665084Sjohnlev } 21675529Ssmaybe create_contig_pending = 0; 21685084Sjohnlev ASSERT(contig_pfn_cnt == 0); 21695084Sjohnlev for (pfn = 0; pfn < mfn_count; pfn++) { 21705084Sjohnlev pp = page_numtopp_nolock(pfn); 21715084Sjohnlev if (pp == NULL || !PP_ISFREE(pp)) 21725084Sjohnlev continue; 21735084Sjohnlev contig_pfn_list[contig_pfn_cnt] = pfn; 21745084Sjohnlev if (++contig_pfn_cnt == contig_pfn_max) 21755084Sjohnlev break; 21765084Sjohnlev } 21779010SStuart.Maybee@Sun.COM /* 21789010SStuart.Maybee@Sun.COM * Sanity check the new list. 21799010SStuart.Maybee@Sun.COM */ 21809010SStuart.Maybee@Sun.COM if (contig_pfn_cnt < 2) { /* no contig pfns */ 21819010SStuart.Maybee@Sun.COM contig_pfn_cnt = 0; 21829010SStuart.Maybee@Sun.COM contig_pfnlist_buildfailed++; 21839010SStuart.Maybee@Sun.COM kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t)); 21849010SStuart.Maybee@Sun.COM contig_pfn_list = NULL; 21859010SStuart.Maybee@Sun.COM contig_pfn_max = 0; 21869010SStuart.Maybee@Sun.COM ret = 0; 21879010SStuart.Maybee@Sun.COM goto out; 21889010SStuart.Maybee@Sun.COM } 21895084Sjohnlev qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare); 21905084Sjohnlev compact_contig_pfn_list(); 21915084Sjohnlev /* 21925084Sjohnlev * Make sure next search of the newly created contiguous pfn 21935084Sjohnlev * list starts at the beginning of the list. 
21945084Sjohnlev */ 21955084Sjohnlev next_alloc_pfn = 0; 21965084Sjohnlev contig_pfnlist_builds++; /* count list builds */ 21975529Ssmaybe out: 21985529Ssmaybe mutex_exit(&contig_list_lock); 21995529Ssmaybe return (ret); 22005084Sjohnlev } 22015084Sjohnlev 22025084Sjohnlev 22035084Sjohnlev /* 22045084Sjohnlev * Toss the current contig pfnlist. Someone is about to do a massive 22055084Sjohnlev * update to pfn<->mfn mappings. So we have them destroy the list and lock 22065084Sjohnlev * it till they are done with their update. 22075084Sjohnlev */ 22085084Sjohnlev void 22095084Sjohnlev clear_and_lock_contig_pfnlist() 22105084Sjohnlev { 22115084Sjohnlev pfn_t *listp = NULL; 22125084Sjohnlev size_t listsize; 22135084Sjohnlev 22145529Ssmaybe mutex_enter(&contig_list_lock); 22155084Sjohnlev if (contig_pfn_list != NULL) { 22165084Sjohnlev listp = contig_pfn_list; 22175084Sjohnlev listsize = contig_pfn_max * sizeof (pfn_t); 22185084Sjohnlev contig_pfn_list = NULL; 22195084Sjohnlev contig_pfn_max = contig_pfn_cnt = 0; 22205084Sjohnlev } 22215084Sjohnlev if (listp != NULL) 22225084Sjohnlev kmem_free(listp, listsize); 22235084Sjohnlev } 22245084Sjohnlev 22255084Sjohnlev /* 22265084Sjohnlev * Unlock the contig_pfn_list. The next attempted use of it will cause 22275084Sjohnlev * it to be re-created. 22285084Sjohnlev */ 22295084Sjohnlev void 22305084Sjohnlev unlock_contig_pfnlist() 22315084Sjohnlev { 22325529Ssmaybe mutex_exit(&contig_list_lock); 22335084Sjohnlev } 22345084Sjohnlev 22355084Sjohnlev /* 22365084Sjohnlev * Update the contiguous pfn list in response to a pfn <-> mfn reassignment 22375084Sjohnlev */ 22385084Sjohnlev void 22395084Sjohnlev update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn) 22405084Sjohnlev { 22415084Sjohnlev int probe_hi, probe_lo, probe_pos, insert_after, insert_point; 22425084Sjohnlev pfn_t probe_pfn; 22435084Sjohnlev mfn_t probe_mfn; 22445529Ssmaybe int drop_lock = 0; 22455529Ssmaybe 22465529Ssmaybe if (mutex_owner(&contig_list_lock) != curthread) { 22475529Ssmaybe drop_lock = 1; 22485529Ssmaybe mutex_enter(&contig_list_lock); 22495529Ssmaybe } 22505084Sjohnlev if (contig_pfn_list == NULL) 22515529Ssmaybe goto done; 22525084Sjohnlev contig_pfnlist_updates++; 22535084Sjohnlev /* 22545084Sjohnlev * Find the pfn in the current list. Use a binary chop to locate it. 22555084Sjohnlev */ 22565084Sjohnlev probe_hi = contig_pfn_cnt - 1; 22575084Sjohnlev probe_lo = 0; 22585084Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 22595084Sjohnlev while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) { 22605084Sjohnlev if (probe_pos == probe_lo) { /* pfn not in list */ 22615084Sjohnlev probe_pos = -1; 22625084Sjohnlev break; 22635084Sjohnlev } 22645084Sjohnlev if (pfn_to_mfn(probe_pfn) <= oldmfn) 22655084Sjohnlev probe_lo = probe_pos; 22665084Sjohnlev else 22675084Sjohnlev probe_hi = probe_pos; 22685084Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 22695084Sjohnlev } 22709010SStuart.Maybee@Sun.COM if (probe_pos >= 0) { 22719010SStuart.Maybee@Sun.COM /* 22729010SStuart.Maybee@Sun.COM * Remove pfn from list and ensure next alloc 22739010SStuart.Maybee@Sun.COM * position stays in bounds. 
22749010SStuart.Maybee@Sun.COM */ 22759010SStuart.Maybee@Sun.COM if (--contig_pfn_cnt <= next_alloc_pfn) 22769010SStuart.Maybee@Sun.COM next_alloc_pfn = 0; 227710175SStuart.Maybee@Sun.COM if (contig_pfn_cnt < 2) { /* no contig pfns */ 227810175SStuart.Maybee@Sun.COM contig_pfn_cnt = 0; 227910175SStuart.Maybee@Sun.COM kmem_free(contig_pfn_list, 228010175SStuart.Maybee@Sun.COM contig_pfn_max * sizeof (pfn_t)); 228110175SStuart.Maybee@Sun.COM contig_pfn_list = NULL; 228210175SStuart.Maybee@Sun.COM contig_pfn_max = 0; 228310175SStuart.Maybee@Sun.COM goto done; 228410175SStuart.Maybee@Sun.COM } 22855084Sjohnlev ovbcopy(&contig_pfn_list[probe_pos + 1], 22865084Sjohnlev &contig_pfn_list[probe_pos], 22875084Sjohnlev (contig_pfn_cnt - probe_pos) * sizeof (pfn_t)); 22885084Sjohnlev } 22895084Sjohnlev if (newmfn == MFN_INVALID) 22905084Sjohnlev goto done; 22915084Sjohnlev /* 22925084Sjohnlev * Check if new mfn has adjacent mfns in the list 22935084Sjohnlev */ 22945084Sjohnlev probe_hi = contig_pfn_cnt - 1; 22955084Sjohnlev probe_lo = 0; 22965084Sjohnlev insert_after = -2; 22975084Sjohnlev do { 22985084Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 22995084Sjohnlev probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]); 23005084Sjohnlev if (newmfn == probe_mfn + 1) 23015084Sjohnlev insert_after = probe_pos; 23025084Sjohnlev else if (newmfn == probe_mfn - 1) 23035084Sjohnlev insert_after = probe_pos - 1; 23045084Sjohnlev if (probe_pos == probe_lo) 23055084Sjohnlev break; 23065084Sjohnlev if (probe_mfn <= newmfn) 23075084Sjohnlev probe_lo = probe_pos; 23085084Sjohnlev else 23095084Sjohnlev probe_hi = probe_pos; 23105084Sjohnlev } while (insert_after == -2); 23115084Sjohnlev /* 23125084Sjohnlev * If there is space in the list and there are adjacent mfns 23135084Sjohnlev * insert the pfn in to its proper place in the list. 23145084Sjohnlev */ 23155084Sjohnlev if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) { 23165084Sjohnlev insert_point = insert_after + 1; 23175084Sjohnlev ovbcopy(&contig_pfn_list[insert_point], 23185084Sjohnlev &contig_pfn_list[insert_point + 1], 23195084Sjohnlev (contig_pfn_cnt - insert_point) * sizeof (pfn_t)); 23205084Sjohnlev contig_pfn_list[insert_point] = pfn; 23215084Sjohnlev contig_pfn_cnt++; 23225084Sjohnlev } 23235084Sjohnlev done: 23245529Ssmaybe if (drop_lock) 23255529Ssmaybe mutex_exit(&contig_list_lock); 23265084Sjohnlev } 23275084Sjohnlev 23285084Sjohnlev /* 23295084Sjohnlev * Called to (re-)populate the io_pool from the free page lists. 23305084Sjohnlev */ 23315084Sjohnlev long 23325084Sjohnlev populate_io_pool(void) 23335084Sjohnlev { 23345084Sjohnlev pfn_t pfn; 23355084Sjohnlev mfn_t mfn, max_mfn; 23365084Sjohnlev page_t *pp; 23375084Sjohnlev 23385084Sjohnlev /* 23395084Sjohnlev * Figure out the bounds of the pool on first invocation. 23405084Sjohnlev * We use a percentage of memory for the io pool size. 23415084Sjohnlev * we allow that to shrink, but not to less than a fixed minimum 23425084Sjohnlev */ 23435084Sjohnlev if (io_pool_cnt_max == 0) { 23445084Sjohnlev io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct); 23455084Sjohnlev io_pool_cnt_lowater = io_pool_cnt_max; 23465084Sjohnlev /* 23475084Sjohnlev * This is the first time in populate_io_pool, grab a va to use 23485084Sjohnlev * when we need to allocate pages. 
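		 * (io_pool_kva is later used by page_swap_with_hypervisor()
		 * as the scratch offset and mapping address for the extra
		 * pages it creates to trade with the hypervisor.)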
23495084Sjohnlev */ 23505084Sjohnlev io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); 23515084Sjohnlev } 23525084Sjohnlev /* 23535084Sjohnlev * If we are out of pages in the pool, then grow the size of the pool 23545084Sjohnlev */ 23556159Ssmaybe if (io_pool_cnt == 0) { 23566159Ssmaybe /* 23576159Ssmaybe * Grow the max size of the io pool by 5%, but never more than 23586159Ssmaybe * 25% of physical memory. 23596159Ssmaybe */ 23606159Ssmaybe if (io_pool_cnt_max < physmem / 4) 23616159Ssmaybe io_pool_cnt_max += io_pool_cnt_max / 20; 23626159Ssmaybe } 23635084Sjohnlev io_pool_grows++; /* should be a kstat? */ 23645084Sjohnlev 23655084Sjohnlev /* 23665084Sjohnlev * Get highest mfn on this platform, but limit to the 32 bit DMA max. 23675084Sjohnlev */ 23685084Sjohnlev (void) mfn_to_pfn(start_mfn); 23695084Sjohnlev max_mfn = MIN(cached_max_mfn, PFN_4GIG); 23705084Sjohnlev for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) { 23715084Sjohnlev pfn = mfn_to_pfn(mfn); 23725084Sjohnlev if (pfn & PFN_IS_FOREIGN_MFN) 23735084Sjohnlev continue; 23745084Sjohnlev /* 23755084Sjohnlev * try to allocate it from free pages 23765084Sjohnlev */ 23775084Sjohnlev pp = page_numtopp_alloc(pfn); 23785084Sjohnlev if (pp == NULL) 23795084Sjohnlev continue; 23805084Sjohnlev PP_CLRFREE(pp); 23815084Sjohnlev add_page_to_pool(pp, 1); 23825084Sjohnlev if (io_pool_cnt >= io_pool_cnt_max) 23835084Sjohnlev break; 23845084Sjohnlev } 23855084Sjohnlev 23865084Sjohnlev return (io_pool_cnt); 23875084Sjohnlev } 23885084Sjohnlev 23895084Sjohnlev /* 23905084Sjohnlev * Destroy a page that was being used for DMA I/O. It may or 23915084Sjohnlev * may not actually go back to the io_pool. 23925084Sjohnlev */ 23935084Sjohnlev void 23945084Sjohnlev page_destroy_io(page_t *pp) 23955084Sjohnlev { 23965084Sjohnlev mfn_t mfn = mfn_list[pp->p_pagenum]; 23975084Sjohnlev 23985084Sjohnlev /* 23995084Sjohnlev * When the page was alloc'd a reservation was made, release it now 24005084Sjohnlev */ 24015084Sjohnlev page_unresv(1); 24025084Sjohnlev /* 24035084Sjohnlev * Unload translations, if any, then hash out the 24045084Sjohnlev * page to erase its identity. 24055084Sjohnlev */ 24065084Sjohnlev (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 24075084Sjohnlev page_hashout(pp, NULL); 24085084Sjohnlev 24095084Sjohnlev /* 24105084Sjohnlev * If the page came from the free lists, just put it back to them. 24115084Sjohnlev * DomU pages always go on the free lists as well. 
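	 * Only dom0 pages with MFNs below 4G are worth keeping in the
	 * DMA pool, which is what the check below tests.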
24125084Sjohnlev */ 24135084Sjohnlev if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) { 24145084Sjohnlev page_free(pp, 1); 24155084Sjohnlev return; 24165084Sjohnlev } 24175084Sjohnlev 24185084Sjohnlev add_page_to_pool(pp, 0); 24195084Sjohnlev } 24205084Sjohnlev 24215084Sjohnlev 24225084Sjohnlev long contig_searches; /* count of times contig pages requested */ 24235084Sjohnlev long contig_search_restarts; /* count of contig ranges tried */ 24245084Sjohnlev long contig_search_failed; /* count of contig alloc failures */ 24255084Sjohnlev 24265084Sjohnlev /* 242710175SStuart.Maybee@Sun.COM * Free partial page list 242810175SStuart.Maybee@Sun.COM */ 242910175SStuart.Maybee@Sun.COM static void 243010175SStuart.Maybee@Sun.COM free_partial_list(page_t **pplist) 243110175SStuart.Maybee@Sun.COM { 243210175SStuart.Maybee@Sun.COM page_t *pp; 243310175SStuart.Maybee@Sun.COM 243410175SStuart.Maybee@Sun.COM while (*pplist != NULL) { 243510175SStuart.Maybee@Sun.COM pp = *pplist; 243610175SStuart.Maybee@Sun.COM page_io_pool_sub(pplist, pp, pp); 243710175SStuart.Maybee@Sun.COM page_free(pp, 1); 243810175SStuart.Maybee@Sun.COM } 243910175SStuart.Maybee@Sun.COM } 244010175SStuart.Maybee@Sun.COM 244110175SStuart.Maybee@Sun.COM /* 24425084Sjohnlev * Look thru the contiguous pfns that are not part of the io_pool for 24435084Sjohnlev * contiguous free pages. Return a list of the found pages or NULL. 24445084Sjohnlev */ 24455084Sjohnlev page_t * 244610175SStuart.Maybee@Sun.COM find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg, 244710175SStuart.Maybee@Sun.COM pgcnt_t pfnalign) 24485084Sjohnlev { 24495084Sjohnlev page_t *pp, *plist = NULL; 24506282Ssmaybe mfn_t mfn, prev_mfn, start_mfn; 24515084Sjohnlev pfn_t pfn; 24525084Sjohnlev int pages_needed, pages_requested; 24535084Sjohnlev int search_start; 24545084Sjohnlev 24555084Sjohnlev /* 24565084Sjohnlev * create the contig pfn list if not already done 24575084Sjohnlev */ 24585529Ssmaybe retry: 24595529Ssmaybe mutex_enter(&contig_list_lock); 24605084Sjohnlev if (contig_pfn_list == NULL) { 24615529Ssmaybe mutex_exit(&contig_list_lock); 24625529Ssmaybe if (!create_contig_pfnlist(flags)) { 24635084Sjohnlev return (NULL); 24645084Sjohnlev } 24655529Ssmaybe goto retry; 24665084Sjohnlev } 24675084Sjohnlev contig_searches++; 24685084Sjohnlev /* 24695084Sjohnlev * Search contiguous pfn list for physically contiguous pages not in 24705084Sjohnlev * the io_pool. Start the search where the last search left off. 24715084Sjohnlev */ 24725843Ssmaybe pages_requested = pages_needed = npages; 24735084Sjohnlev search_start = next_alloc_pfn; 24746282Ssmaybe start_mfn = prev_mfn = 0; 24755084Sjohnlev while (pages_needed) { 24765084Sjohnlev pfn = contig_pfn_list[next_alloc_pfn]; 24775084Sjohnlev mfn = pfn_to_mfn(pfn); 24786282Ssmaybe /* 24796282Ssmaybe * Check if mfn is first one or contig to previous one and 24806282Ssmaybe * if page corresponding to mfn is free and that mfn 24816282Ssmaybe * range is not crossing a segment boundary. 
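		 * The run is rejected when the current mfn's offset within
		 * the DMA segment (mfn & pfnseg) falls below that of the
		 * first mfn of the run, i.e. the run has wrapped across a
		 * segment boundary.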
24826282Ssmaybe */ 24835084Sjohnlev if ((prev_mfn == 0 || mfn == prev_mfn + 1) && 24846282Ssmaybe (pp = page_numtopp_alloc(pfn)) != NULL && 24856282Ssmaybe !((mfn & pfnseg) < (start_mfn & pfnseg))) { 24865084Sjohnlev PP_CLRFREE(pp); 24875084Sjohnlev page_io_pool_add(&plist, pp); 24885084Sjohnlev pages_needed--; 248910175SStuart.Maybee@Sun.COM if (prev_mfn == 0) { 249010175SStuart.Maybee@Sun.COM if (pfnalign && 249110175SStuart.Maybee@Sun.COM mfn != P2ROUNDUP(mfn, pfnalign)) { 249210175SStuart.Maybee@Sun.COM /* 249310175SStuart.Maybee@Sun.COM * not properly aligned 249410175SStuart.Maybee@Sun.COM */ 249510175SStuart.Maybee@Sun.COM contig_search_restarts++; 249610175SStuart.Maybee@Sun.COM free_partial_list(&plist); 249710175SStuart.Maybee@Sun.COM pages_needed = pages_requested; 249810175SStuart.Maybee@Sun.COM start_mfn = prev_mfn = 0; 249910175SStuart.Maybee@Sun.COM goto skip; 250010175SStuart.Maybee@Sun.COM } 25016282Ssmaybe start_mfn = mfn; 250210175SStuart.Maybee@Sun.COM } 25035084Sjohnlev prev_mfn = mfn; 25045084Sjohnlev } else { 25055084Sjohnlev contig_search_restarts++; 250610175SStuart.Maybee@Sun.COM free_partial_list(&plist); 25075084Sjohnlev pages_needed = pages_requested; 25086282Ssmaybe start_mfn = prev_mfn = 0; 25095084Sjohnlev } 251010175SStuart.Maybee@Sun.COM skip: 25115084Sjohnlev if (++next_alloc_pfn == contig_pfn_cnt) 25125084Sjohnlev next_alloc_pfn = 0; 25135084Sjohnlev if (next_alloc_pfn == search_start) 25145084Sjohnlev break; /* all pfns searched */ 25155084Sjohnlev } 25165529Ssmaybe mutex_exit(&contig_list_lock); 25175084Sjohnlev if (pages_needed) { 25185084Sjohnlev contig_search_failed++; 25195084Sjohnlev /* 25205084Sjohnlev * Failed to find enough contig pages. 25215084Sjohnlev * free partial page list 25225084Sjohnlev */ 252310175SStuart.Maybee@Sun.COM free_partial_list(&plist); 25245084Sjohnlev } 25255084Sjohnlev return (plist); 25265084Sjohnlev } 25275084Sjohnlev 25285084Sjohnlev /* 25295843Ssmaybe * Search the reserved io pool pages for a page range with the 25305843Ssmaybe * desired characteristics. 25315084Sjohnlev */ 25325084Sjohnlev page_t * 25335843Ssmaybe page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg) 25345084Sjohnlev { 25355843Ssmaybe page_t *pp_first, *pp_last; 25365843Ssmaybe page_t *pp, **poolp; 25375843Ssmaybe pgcnt_t nwanted, pfnalign; 25385084Sjohnlev uint64_t pfnseg; 25395843Ssmaybe mfn_t mfn, tmfn, hi_mfn, lo_mfn; 25405843Ssmaybe int align, attempt = 0; 25415843Ssmaybe 25425843Ssmaybe if (minctg == 1) 25435843Ssmaybe contig = 0; 25445084Sjohnlev lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 25455084Sjohnlev hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 25465843Ssmaybe pfnseg = mmu_btop(mattr->dma_attr_seg); 25475084Sjohnlev align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 25485084Sjohnlev if (align > MMU_PAGESIZE) 25495084Sjohnlev pfnalign = mmu_btop(align); 25505843Ssmaybe else 25515843Ssmaybe pfnalign = 0; 25525843Ssmaybe 25535084Sjohnlev try_again: 25545084Sjohnlev /* 25555084Sjohnlev * See if we want pages for a legacy device 25565084Sjohnlev */ 25575084Sjohnlev if (hi_mfn < PFN_16MEG) 25585084Sjohnlev poolp = &io_pool_16m; 25595084Sjohnlev else 25605084Sjohnlev poolp = &io_pool_4g; 25615084Sjohnlev try_smaller: 25625084Sjohnlev /* 25635843Ssmaybe * Take pages from I/O pool. We'll use pages from the highest 25645843Ssmaybe * MFN range possible. 
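	 * The pool is kept in increasing MFN order, so the loop starts at
	 * (*poolp)->p_prev (the highest MFN) and walks toward lower MFNs,
	 * restarting its candidate run whenever the alignment, segment or
	 * contiguity checks fail.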
25655084Sjohnlev */ 25665084Sjohnlev pp_first = pp_last = NULL; 25675084Sjohnlev mutex_enter(&io_pool_lock); 25685843Ssmaybe nwanted = minctg; 25695843Ssmaybe for (pp = *poolp; pp && nwanted > 0; ) { 25705084Sjohnlev pp = pp->p_prev; 25715084Sjohnlev 25725084Sjohnlev /* 25735084Sjohnlev * skip pages above allowable range 25745084Sjohnlev */ 25755084Sjohnlev mfn = mfn_list[pp->p_pagenum]; 25765084Sjohnlev if (hi_mfn < mfn) 25775084Sjohnlev goto skip; 25785084Sjohnlev 25795084Sjohnlev /* 25805084Sjohnlev * stop at pages below allowable range 25815084Sjohnlev */ 25825084Sjohnlev if (lo_mfn > mfn) 25835084Sjohnlev break; 25845084Sjohnlev restart: 25855084Sjohnlev if (pp_last == NULL) { 25865084Sjohnlev /* 25875084Sjohnlev * Check alignment 25885084Sjohnlev */ 25895843Ssmaybe tmfn = mfn - (minctg - 1); 25905843Ssmaybe if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign)) 25915843Ssmaybe goto skip; /* not properly aligned */ 25925084Sjohnlev /* 25935084Sjohnlev * Check segment 25945084Sjohnlev */ 25955084Sjohnlev if ((mfn & pfnseg) < (tmfn & pfnseg)) 25965843Ssmaybe goto skip; /* crosses seg boundary */ 25975084Sjohnlev /* 25985084Sjohnlev * Start building page list 25995084Sjohnlev */ 26005084Sjohnlev pp_first = pp_last = pp; 26015843Ssmaybe nwanted--; 26025084Sjohnlev } else { 26035084Sjohnlev /* 26045084Sjohnlev * check physical contiguity if required 26055084Sjohnlev */ 26065084Sjohnlev if (contig && 26075084Sjohnlev mfn_list[pp_first->p_pagenum] != mfn + 1) { 26085084Sjohnlev /* 26095084Sjohnlev * not a contiguous page, restart list. 26105084Sjohnlev */ 26115084Sjohnlev pp_last = NULL; 26125843Ssmaybe nwanted = minctg; 26135084Sjohnlev goto restart; 26145084Sjohnlev } else { /* add page to list */ 26155084Sjohnlev pp_first = pp; 26165843Ssmaybe nwanted--; 26175084Sjohnlev } 26185084Sjohnlev } 26195084Sjohnlev skip: 26205084Sjohnlev if (pp == *poolp) 26215084Sjohnlev break; 26225084Sjohnlev } 26235084Sjohnlev 26245084Sjohnlev /* 26255084Sjohnlev * If we didn't find memory. Try the more constrained pool, then 26265843Ssmaybe * sweep free pages into the DMA pool and try again. 26275084Sjohnlev */ 26285843Ssmaybe if (nwanted != 0) { 26295084Sjohnlev mutex_exit(&io_pool_lock); 26305084Sjohnlev /* 26315843Ssmaybe * If we were looking in the less constrained pool and 26325843Ssmaybe * didn't find pages, try the more constrained pool. 
26335084Sjohnlev */ 26345084Sjohnlev if (poolp == &io_pool_4g) { 26355084Sjohnlev poolp = &io_pool_16m; 26365084Sjohnlev goto try_smaller; 26375084Sjohnlev } 26385084Sjohnlev kmem_reap(); 26395084Sjohnlev if (++attempt < 4) { 26405084Sjohnlev /* 26415084Sjohnlev * Grab some more io_pool pages 26425084Sjohnlev */ 26435084Sjohnlev (void) populate_io_pool(); 26445843Ssmaybe goto try_again; /* go around and retry */ 26455084Sjohnlev } 26465843Ssmaybe return (NULL); 26475084Sjohnlev } 26485084Sjohnlev /* 26495084Sjohnlev * Found the pages, now snip them from the list 26505084Sjohnlev */ 26515084Sjohnlev page_io_pool_sub(poolp, pp_first, pp_last); 26525843Ssmaybe io_pool_cnt -= minctg; 26535843Ssmaybe /* 26545843Ssmaybe * reset low water mark 26555843Ssmaybe */ 26565084Sjohnlev if (io_pool_cnt < io_pool_cnt_lowater) 26575843Ssmaybe io_pool_cnt_lowater = io_pool_cnt; 26585084Sjohnlev mutex_exit(&io_pool_lock); 26595843Ssmaybe return (pp_first); 26605843Ssmaybe } 26615843Ssmaybe 26625843Ssmaybe page_t * 26635843Ssmaybe page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr, 26645843Ssmaybe ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg) 26655843Ssmaybe { 26665843Ssmaybe uint_t kflags; 26675843Ssmaybe int order, extra, extpages, i, contig, nbits, extents; 26685843Ssmaybe page_t *pp, *expp, *pp_first, **pplist = NULL; 26695843Ssmaybe mfn_t *mfnlist = NULL; 26705843Ssmaybe 26715843Ssmaybe contig = flags & PG_PHYSCONTIG; 26725843Ssmaybe if (minctg == 1) 26735843Ssmaybe contig = 0; 26745843Ssmaybe flags &= ~PG_PHYSCONTIG; 26755843Ssmaybe kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP; 26765843Ssmaybe /* 26775843Ssmaybe * Hypervisor will allocate extents, if we want contig 26785843Ssmaybe * pages extent must be >= minctg 26795843Ssmaybe */ 26805843Ssmaybe if (contig) { 26815843Ssmaybe order = highbit(minctg) - 1; 26825843Ssmaybe if (minctg & ((1 << order) - 1)) 26835843Ssmaybe order++; 26845843Ssmaybe extpages = 1 << order; 26855843Ssmaybe } else { 26865843Ssmaybe order = 0; 26875843Ssmaybe extpages = minctg; 26885843Ssmaybe } 26895843Ssmaybe if (extpages > minctg) { 26905843Ssmaybe extra = extpages - minctg; 26915843Ssmaybe if (!page_resv(extra, kflags)) 26925843Ssmaybe return (NULL); 26935843Ssmaybe } 26945843Ssmaybe pp_first = NULL; 26955843Ssmaybe pplist = kmem_alloc(extpages * sizeof (page_t *), kflags); 26965843Ssmaybe if (pplist == NULL) 26975843Ssmaybe goto balloon_fail; 26985843Ssmaybe mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags); 26995843Ssmaybe if (mfnlist == NULL) 27005843Ssmaybe goto balloon_fail; 27015843Ssmaybe pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr); 27025843Ssmaybe if (pp == NULL) 27035843Ssmaybe goto balloon_fail; 27045843Ssmaybe pp_first = pp; 27055843Ssmaybe if (extpages > minctg) { 27065843Ssmaybe /* 27075843Ssmaybe * fill out the rest of extent pages to swap 27085843Ssmaybe * with the hypervisor 27095843Ssmaybe */ 27105843Ssmaybe for (i = 0; i < extra; i++) { 27115843Ssmaybe expp = page_create_va(vp, 27125843Ssmaybe (u_offset_t)(uintptr_t)io_pool_kva, 27135843Ssmaybe PAGESIZE, flags, &kvseg, io_pool_kva); 27145843Ssmaybe if (expp == NULL) 27155843Ssmaybe goto balloon_fail; 27165843Ssmaybe (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD); 27175843Ssmaybe page_io_unlock(expp); 27185843Ssmaybe page_hashout(expp, NULL); 27195843Ssmaybe page_io_lock(expp); 27205843Ssmaybe /* 27215843Ssmaybe * add page to end of list 27225843Ssmaybe */ 27235843Ssmaybe expp->p_prev = pp_first->p_prev; 27245843Ssmaybe expp->p_next = 
pp_first; 27255843Ssmaybe expp->p_prev->p_next = expp; 27265843Ssmaybe pp_first->p_prev = expp; 27275084Sjohnlev } 27285843Ssmaybe 27295843Ssmaybe } 27305843Ssmaybe for (i = 0; i < extpages; i++) { 27315843Ssmaybe pplist[i] = pp; 27325084Sjohnlev pp = pp->p_next; 27335843Ssmaybe } 27345843Ssmaybe nbits = highbit(mattr->dma_attr_addr_hi); 27355843Ssmaybe extents = contig ? 1 : minctg; 27365843Ssmaybe if (balloon_replace_pages(extents, pplist, nbits, order, 27375843Ssmaybe mfnlist) != extents) { 27385843Ssmaybe if (ioalloc_dbg) 27395843Ssmaybe cmn_err(CE_NOTE, "request to hypervisor" 27405843Ssmaybe " for %d pages, maxaddr %" PRIx64 " failed", 27415843Ssmaybe extpages, mattr->dma_attr_addr_hi); 27425843Ssmaybe goto balloon_fail; 27435843Ssmaybe } 27445843Ssmaybe 27455843Ssmaybe kmem_free(pplist, extpages * sizeof (page_t *)); 27465843Ssmaybe kmem_free(mfnlist, extpages * sizeof (mfn_t)); 27475843Ssmaybe /* 27485843Ssmaybe * Return any excess pages to free list 27495843Ssmaybe */ 27505843Ssmaybe if (extpages > minctg) { 27515843Ssmaybe for (i = 0; i < extra; i++) { 27525843Ssmaybe pp = pp_first->p_prev; 27535843Ssmaybe page_sub(&pp_first, pp); 27545843Ssmaybe page_io_unlock(pp); 27555843Ssmaybe page_unresv(1); 27565843Ssmaybe page_free(pp, 1); 27575843Ssmaybe } 27585843Ssmaybe } 27595084Sjohnlev return (pp_first); 27605084Sjohnlev balloon_fail: 27615084Sjohnlev /* 27625084Sjohnlev * Return pages to free list and return failure 27635084Sjohnlev */ 27645084Sjohnlev while (pp_first != NULL) { 27655084Sjohnlev pp = pp_first; 27665084Sjohnlev page_sub(&pp_first, pp); 27675084Sjohnlev page_io_unlock(pp); 27685084Sjohnlev if (pp->p_vnode != NULL) 27695084Sjohnlev page_hashout(pp, NULL); 27705084Sjohnlev page_free(pp, 1); 27715084Sjohnlev } 27725084Sjohnlev if (pplist) 27735084Sjohnlev kmem_free(pplist, extpages * sizeof (page_t *)); 27745084Sjohnlev if (mfnlist) 27755084Sjohnlev kmem_free(mfnlist, extpages * sizeof (mfn_t)); 27765843Ssmaybe page_unresv(extpages - minctg); 27775843Ssmaybe return (NULL); 27785843Ssmaybe } 27795843Ssmaybe 27805843Ssmaybe static void 27815843Ssmaybe return_partial_alloc(page_t *plist) 27825843Ssmaybe { 27835843Ssmaybe page_t *pp; 27845843Ssmaybe 27855843Ssmaybe while (plist != NULL) { 27865843Ssmaybe pp = plist; 27875843Ssmaybe page_sub(&plist, pp); 27887173Smrj page_io_unlock(pp); 27895843Ssmaybe page_destroy_io(pp); 27905843Ssmaybe } 27915843Ssmaybe } 27925843Ssmaybe 27935843Ssmaybe static page_t * 27945843Ssmaybe page_get_contigpages( 27955843Ssmaybe struct vnode *vp, 27965843Ssmaybe u_offset_t off, 27975843Ssmaybe int *npagesp, 27985843Ssmaybe uint_t flags, 27995843Ssmaybe caddr_t vaddr, 28005843Ssmaybe ddi_dma_attr_t *mattr) 28015843Ssmaybe { 28025843Ssmaybe mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 28035843Ssmaybe page_t *plist; /* list to return */ 28045843Ssmaybe page_t *pp, *mcpl; 28055843Ssmaybe int contig, anyaddr, npages, getone = 0; 28065843Ssmaybe mfn_t lo_mfn; 28075843Ssmaybe mfn_t hi_mfn; 28085843Ssmaybe pgcnt_t pfnalign = 0; 28095843Ssmaybe int align, sgllen; 28105843Ssmaybe uint64_t pfnseg; 28115843Ssmaybe pgcnt_t minctg; 28125843Ssmaybe 28135843Ssmaybe npages = *npagesp; 28145843Ssmaybe ASSERT(mattr != NULL); 28155843Ssmaybe lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 28165843Ssmaybe hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 28175843Ssmaybe sgllen = mattr->dma_attr_sgllen; 28185843Ssmaybe pfnseg = mmu_btop(mattr->dma_attr_seg); 28195843Ssmaybe align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 28205843Ssmaybe 
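	/*
	 * The DMA attributes are recast in page frame terms: lo_mfn/hi_mfn
	 * bound the usable machine frame range, pfnseg is the segment
	 * boundary a contiguous run may not cross, and pfnalign (set just
	 * below when the required alignment exceeds a page) is the starting
	 * alignment in pages.
	 */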
if (align > MMU_PAGESIZE) 28215843Ssmaybe pfnalign = mmu_btop(align); 28225843Ssmaybe 282310175SStuart.Maybee@Sun.COM contig = flags & PG_PHYSCONTIG; 282410175SStuart.Maybee@Sun.COM if (npages == -1) { 282510175SStuart.Maybee@Sun.COM npages = 1; 282610175SStuart.Maybee@Sun.COM pfnalign = 0; 282710175SStuart.Maybee@Sun.COM } 28285843Ssmaybe /* 28295843Ssmaybe * Clear the contig flag if only one page is needed. 28305843Ssmaybe */ 28315843Ssmaybe if (npages == 1) { 28325843Ssmaybe getone = 1; 28335843Ssmaybe contig = 0; 28345843Ssmaybe } 28355843Ssmaybe 28365843Ssmaybe /* 28375843Ssmaybe * Check if any page in the system is fine. 28385843Ssmaybe */ 283910175SStuart.Maybee@Sun.COM anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn; 284010175SStuart.Maybee@Sun.COM if (!contig && anyaddr && !pfnalign) { 28415843Ssmaybe flags &= ~PG_PHYSCONTIG; 28425843Ssmaybe plist = page_create_va(vp, off, npages * MMU_PAGESIZE, 28435843Ssmaybe flags, &kvseg, vaddr); 28445843Ssmaybe if (plist != NULL) { 28455843Ssmaybe *npagesp = 0; 28465843Ssmaybe return (plist); 28475843Ssmaybe } 28485843Ssmaybe } 28495843Ssmaybe plist = NULL; 28505843Ssmaybe minctg = howmany(npages, sgllen); 28515843Ssmaybe while (npages > sgllen || getone) { 28526015Ssmaybe if (minctg > npages) 28536015Ssmaybe minctg = npages; 28546015Ssmaybe mcpl = NULL; 28555843Ssmaybe /* 285610175SStuart.Maybee@Sun.COM * We could want contig pages with no address range limits. 28575843Ssmaybe */ 28586282Ssmaybe if (anyaddr && contig) { 28595843Ssmaybe /* 28605843Ssmaybe * Look for free contig pages to satisfy the request. 28615843Ssmaybe */ 286210175SStuart.Maybee@Sun.COM mcpl = find_contig_free(minctg, flags, pfnseg, 286310175SStuart.Maybee@Sun.COM pfnalign); 28645843Ssmaybe } 28655843Ssmaybe /* 28665843Ssmaybe * Try the reserved io pools next 28675843Ssmaybe */ 28685843Ssmaybe if (mcpl == NULL) 28695843Ssmaybe mcpl = page_io_pool_alloc(mattr, contig, minctg); 28705843Ssmaybe if (mcpl != NULL) { 28715843Ssmaybe pp = mcpl; 28725843Ssmaybe do { 28735843Ssmaybe if (!page_hashin(pp, vp, off, NULL)) { 28745843Ssmaybe panic("page_get_contigpages:" 28755843Ssmaybe " hashin failed" 28765843Ssmaybe " pp %p, vp %p, off %llx", 28775843Ssmaybe (void *)pp, (void *)vp, off); 28785843Ssmaybe } 28795843Ssmaybe off += MMU_PAGESIZE; 28805843Ssmaybe PP_CLRFREE(pp); 28815843Ssmaybe PP_CLRAGED(pp); 28825843Ssmaybe page_set_props(pp, P_REF); 28835843Ssmaybe page_io_lock(pp); 28845843Ssmaybe pp = pp->p_next; 28855843Ssmaybe } while (pp != mcpl); 28865843Ssmaybe } else { 28875843Ssmaybe /* 28885843Ssmaybe * Hypervisor exchange doesn't handle segment or 28895843Ssmaybe * alignment constraints 28905843Ssmaybe */ 28915843Ssmaybe if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi || 28925843Ssmaybe pfnalign) 28935843Ssmaybe goto fail; 28945843Ssmaybe /* 28955843Ssmaybe * Try exchanging pages with the hypervisor 28965843Ssmaybe */ 28975843Ssmaybe mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr, 28985843Ssmaybe flags, minctg); 28995843Ssmaybe if (mcpl == NULL) 29005843Ssmaybe goto fail; 29015843Ssmaybe off += minctg * MMU_PAGESIZE; 29025843Ssmaybe } 29035843Ssmaybe check_dma(mattr, mcpl, minctg); 29045843Ssmaybe /* 29055843Ssmaybe * Here with a minctg run of contiguous pages, add them to the 29065843Ssmaybe * list we will return for this request. 
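 * Each pass consumes one scatter/gather entry: npages drops by minctg
 * and sgllen is decremented, so the loop ends once the pages still
 * needed fit into the remaining entries (or after one pass when only a
 * single page was requested).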
29075843Ssmaybe */ 29085843Ssmaybe page_list_concat(&plist, &mcpl); 29095843Ssmaybe npages -= minctg; 29105843Ssmaybe *npagesp = npages; 29115843Ssmaybe sgllen--; 29126015Ssmaybe if (getone) 29136015Ssmaybe break; 29145843Ssmaybe } 29155843Ssmaybe return (plist); 29165843Ssmaybe fail: 29175843Ssmaybe return_partial_alloc(plist); 29185843Ssmaybe return (NULL); 29195843Ssmaybe } 29205843Ssmaybe 29215843Ssmaybe /* 29225843Ssmaybe * Allocator for domain 0 I/O pages. We match the required 29235843Ssmaybe * DMA attributes and contiguity constraints. 29245843Ssmaybe */ 29255843Ssmaybe /*ARGSUSED*/ 29265843Ssmaybe page_t * 29275843Ssmaybe page_create_io( 29285843Ssmaybe struct vnode *vp, 29295843Ssmaybe u_offset_t off, 29305843Ssmaybe uint_t bytes, 29315843Ssmaybe uint_t flags, 29325843Ssmaybe struct as *as, 29335843Ssmaybe caddr_t vaddr, 29345843Ssmaybe ddi_dma_attr_t *mattr) 29355843Ssmaybe { 29365843Ssmaybe page_t *plist = NULL, *pp; 29375843Ssmaybe int npages = 0, contig, anyaddr, pages_req; 29385843Ssmaybe mfn_t lo_mfn; 29395843Ssmaybe mfn_t hi_mfn; 29405843Ssmaybe pgcnt_t pfnalign = 0; 29415843Ssmaybe int align; 29425843Ssmaybe int is_domu = 0; 29435843Ssmaybe int dummy, bytes_got; 29445843Ssmaybe mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 29455843Ssmaybe 29465843Ssmaybe ASSERT(mattr != NULL); 29475843Ssmaybe lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 29485843Ssmaybe hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 29495843Ssmaybe align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 29505843Ssmaybe if (align > MMU_PAGESIZE) 29515843Ssmaybe pfnalign = mmu_btop(align); 29525843Ssmaybe 29535843Ssmaybe /* 29545843Ssmaybe * Clear the contig flag if only one page is needed or the scatter 29555843Ssmaybe * gather list length is >= npages. 29565843Ssmaybe */ 29575843Ssmaybe pages_req = npages = mmu_btopr(bytes); 29585843Ssmaybe contig = (flags & PG_PHYSCONTIG); 29595843Ssmaybe bytes = P2ROUNDUP(bytes, MMU_PAGESIZE); 29605843Ssmaybe if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages) 29615843Ssmaybe contig = 0; 29625843Ssmaybe 29635843Ssmaybe /* 29645843Ssmaybe * Check if any old page in the system is fine. 29655843Ssmaybe * DomU should always go down this path. 29665843Ssmaybe */ 29675843Ssmaybe is_domu = !DOMAIN_IS_INITDOMAIN(xen_info); 29685843Ssmaybe anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign; 29695843Ssmaybe if ((!contig && anyaddr) || is_domu) { 29705843Ssmaybe flags &= ~PG_PHYSCONTIG; 29715843Ssmaybe plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr); 29725843Ssmaybe if (plist != NULL) 29735843Ssmaybe return (plist); 29745843Ssmaybe else if (is_domu) 29755843Ssmaybe return (NULL); /* no memory available */ 29765843Ssmaybe } 29775843Ssmaybe /* 29785843Ssmaybe * DomU should never reach here 29795843Ssmaybe */ 29805843Ssmaybe if (contig) { 29815843Ssmaybe plist = page_get_contigpages(vp, off, &npages, flags, vaddr, 29825843Ssmaybe mattr); 29835843Ssmaybe if (plist == NULL) 29845843Ssmaybe goto fail; 29855843Ssmaybe bytes_got = (pages_req - npages) << MMU_PAGESHIFT; 29865843Ssmaybe vaddr += bytes_got; 29875843Ssmaybe off += bytes_got; 29885843Ssmaybe /* 29895843Ssmaybe * We now have all the contiguous pages we need, but 29905843Ssmaybe * we may still need additional non-contiguous pages. 
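 * (vaddr and off were advanced above past the bytes already satisfied,
 * so the single-page loop below carries on from the correct offset.)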
29915843Ssmaybe */ 29925843Ssmaybe } 29935843Ssmaybe /* 29945843Ssmaybe * now loop collecting the requested number of pages, these do 29955843Ssmaybe * not have to be contiguous pages but we will use the contig 29965843Ssmaybe * page alloc code to get the pages since it will honor any 29975843Ssmaybe * other constraints the pages may have. 29985843Ssmaybe */ 29995843Ssmaybe while (npages--) { 300010175SStuart.Maybee@Sun.COM dummy = -1; 30015843Ssmaybe pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr); 30025843Ssmaybe if (pp == NULL) 30035843Ssmaybe goto fail; 30045843Ssmaybe page_add(&plist, pp); 30055843Ssmaybe vaddr += MMU_PAGESIZE; 30065843Ssmaybe off += MMU_PAGESIZE; 30075843Ssmaybe } 30085843Ssmaybe return (plist); 30095843Ssmaybe fail: 30105843Ssmaybe /* 30115843Ssmaybe * Failed to get enough pages, return ones we did get 30125843Ssmaybe */ 30135843Ssmaybe return_partial_alloc(plist); 30145084Sjohnlev return (NULL); 30155084Sjohnlev } 30165084Sjohnlev 30175084Sjohnlev /* 30185084Sjohnlev * Lock and return the page with the highest mfn that we can find. last_mfn 30195084Sjohnlev * holds the last one found, so the next search can start from there. We 30205084Sjohnlev * also keep a counter so that we don't loop forever if the machine has no 30215084Sjohnlev * free pages. 30225084Sjohnlev * 30235084Sjohnlev * This is called from the balloon thread to find pages to give away. new_high 30245084Sjohnlev * is used when new mfn's have been added to the system - we will reset our 30255084Sjohnlev * search if the new mfn's are higher than our current search position. 30265084Sjohnlev */ 30275084Sjohnlev page_t * 30285084Sjohnlev page_get_high_mfn(mfn_t new_high) 30295084Sjohnlev { 30305084Sjohnlev static mfn_t last_mfn = 0; 30315084Sjohnlev pfn_t pfn; 30325084Sjohnlev page_t *pp; 30335084Sjohnlev ulong_t loop_count = 0; 30345084Sjohnlev 30355084Sjohnlev if (new_high > last_mfn) 30365084Sjohnlev last_mfn = new_high; 30375084Sjohnlev 30385084Sjohnlev for (; loop_count < mfn_count; loop_count++, last_mfn--) { 30395084Sjohnlev if (last_mfn == 0) { 30405084Sjohnlev last_mfn = cached_max_mfn; 30415084Sjohnlev } 30425084Sjohnlev 30435084Sjohnlev pfn = mfn_to_pfn(last_mfn); 30445084Sjohnlev if (pfn & PFN_IS_FOREIGN_MFN) 30455084Sjohnlev continue; 30465084Sjohnlev 30475084Sjohnlev /* See if the page is free. If so, lock it. 
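	 * A non-NULL return from page_numtopp_alloc() is held SE_EXCL;
	 * the free bit is cleared here because the balloon thread now
	 * owns the page.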
*/ 30485084Sjohnlev pp = page_numtopp_alloc(pfn); 30495084Sjohnlev if (pp == NULL) 30505084Sjohnlev continue; 30515084Sjohnlev PP_CLRFREE(pp); 30525084Sjohnlev 30535084Sjohnlev ASSERT(PAGE_EXCL(pp)); 30545084Sjohnlev ASSERT(pp->p_vnode == NULL); 30555084Sjohnlev ASSERT(!hat_page_is_mapped(pp)); 30565084Sjohnlev last_mfn--; 30575084Sjohnlev return (pp); 30585084Sjohnlev } 30595084Sjohnlev return (NULL); 30605084Sjohnlev } 30615084Sjohnlev 30625084Sjohnlev #else /* !__xpv */ 30635084Sjohnlev 30640Sstevel@tonic-gate /* 30650Sstevel@tonic-gate * get a page from any list with the given mnode 30660Sstevel@tonic-gate */ 30675084Sjohnlev static page_t * 30680Sstevel@tonic-gate page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 30690Sstevel@tonic-gate int mnode, int mtype, ddi_dma_attr_t *dma_attr) 30700Sstevel@tonic-gate { 30712961Sdp78419 kmutex_t *pcm; 30722961Sdp78419 int i; 30732961Sdp78419 page_t *pp; 30742961Sdp78419 page_t *first_pp; 30752961Sdp78419 uint64_t pgaddr; 30762961Sdp78419 ulong_t bin; 30772961Sdp78419 int mtypestart; 30782961Sdp78419 int plw_initialized; 30792961Sdp78419 page_list_walker_t plw; 30800Sstevel@tonic-gate 30810Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_alloc); 30820Sstevel@tonic-gate 30830Sstevel@tonic-gate ASSERT((flags & PG_MATCH_COLOR) == 0); 30840Sstevel@tonic-gate ASSERT(szc == 0); 30850Sstevel@tonic-gate ASSERT(dma_attr != NULL); 30860Sstevel@tonic-gate 30870Sstevel@tonic-gate MTYPE_START(mnode, mtype, flags); 30880Sstevel@tonic-gate if (mtype < 0) { 30890Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocempty); 30900Sstevel@tonic-gate return (NULL); 30910Sstevel@tonic-gate } 30920Sstevel@tonic-gate 30930Sstevel@tonic-gate mtypestart = mtype; 30940Sstevel@tonic-gate 30950Sstevel@tonic-gate bin = origbin; 30960Sstevel@tonic-gate 30970Sstevel@tonic-gate /* 30980Sstevel@tonic-gate * check up to page_colors + 1 bins - origbin may be checked twice 30990Sstevel@tonic-gate * because of BIN_STEP skip 31000Sstevel@tonic-gate */ 31010Sstevel@tonic-gate do { 31022961Sdp78419 plw_initialized = 0; 31032961Sdp78419 31042961Sdp78419 for (plw.plw_count = 0; 31052961Sdp78419 plw.plw_count < page_colors; plw.plw_count++) { 31062961Sdp78419 31070Sstevel@tonic-gate if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 31080Sstevel@tonic-gate goto nextfreebin; 31090Sstevel@tonic-gate 31100Sstevel@tonic-gate pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 31110Sstevel@tonic-gate mutex_enter(pcm); 31120Sstevel@tonic-gate pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 31130Sstevel@tonic-gate first_pp = pp; 31140Sstevel@tonic-gate while (pp != NULL) { 31150Sstevel@tonic-gate if (page_trylock(pp, SE_EXCL) == 0) { 31160Sstevel@tonic-gate pp = pp->p_next; 31170Sstevel@tonic-gate if (pp == first_pp) { 31180Sstevel@tonic-gate pp = NULL; 31190Sstevel@tonic-gate } 31200Sstevel@tonic-gate continue; 31210Sstevel@tonic-gate } 31220Sstevel@tonic-gate 31230Sstevel@tonic-gate ASSERT(PP_ISFREE(pp)); 31240Sstevel@tonic-gate ASSERT(PP_ISAGED(pp)); 31250Sstevel@tonic-gate ASSERT(pp->p_vnode == NULL); 31260Sstevel@tonic-gate ASSERT(pp->p_hash == NULL); 31270Sstevel@tonic-gate ASSERT(pp->p_offset == (u_offset_t)-1); 31280Sstevel@tonic-gate ASSERT(pp->p_szc == szc); 31290Sstevel@tonic-gate ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 31300Sstevel@tonic-gate /* check if page within DMA attributes */ 31313446Smrj pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 31320Sstevel@tonic-gate if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 31330Sstevel@tonic-gate (pgaddr + MMU_PAGESIZE - 1 <= 
31340Sstevel@tonic-gate dma_attr->dma_attr_addr_hi)) { 31350Sstevel@tonic-gate break; 31360Sstevel@tonic-gate } 31370Sstevel@tonic-gate 31380Sstevel@tonic-gate /* continue looking */ 31390Sstevel@tonic-gate page_unlock(pp); 31400Sstevel@tonic-gate pp = pp->p_next; 31410Sstevel@tonic-gate if (pp == first_pp) 31420Sstevel@tonic-gate pp = NULL; 31430Sstevel@tonic-gate 31440Sstevel@tonic-gate } 31450Sstevel@tonic-gate if (pp != NULL) { 31460Sstevel@tonic-gate ASSERT(mtype == PP_2_MTYPE(pp)); 31470Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 31480Sstevel@tonic-gate 31490Sstevel@tonic-gate /* found a page with specified DMA attributes */ 31500Sstevel@tonic-gate page_sub(&PAGE_FREELISTS(mnode, szc, bin, 31510Sstevel@tonic-gate mtype), pp); 3152414Skchow page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 31530Sstevel@tonic-gate 31540Sstevel@tonic-gate if ((PP_ISFREE(pp) == 0) || 31550Sstevel@tonic-gate (PP_ISAGED(pp) == 0)) { 31560Sstevel@tonic-gate cmn_err(CE_PANIC, "page %p is not free", 31570Sstevel@tonic-gate (void *)pp); 31580Sstevel@tonic-gate } 31590Sstevel@tonic-gate 31600Sstevel@tonic-gate mutex_exit(pcm); 31610Sstevel@tonic-gate check_dma(dma_attr, pp, 1); 31620Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocok); 31630Sstevel@tonic-gate return (pp); 31640Sstevel@tonic-gate } 31650Sstevel@tonic-gate mutex_exit(pcm); 31660Sstevel@tonic-gate nextfreebin: 31672961Sdp78419 if (plw_initialized == 0) { 31682961Sdp78419 page_list_walk_init(szc, 0, bin, 1, 0, &plw); 31692961Sdp78419 ASSERT(plw.plw_ceq_dif == page_colors); 31702961Sdp78419 plw_initialized = 1; 31712961Sdp78419 } 31720Sstevel@tonic-gate 31732961Sdp78419 if (plw.plw_do_split) { 31742961Sdp78419 pp = page_freelist_split(szc, bin, mnode, 31752961Sdp78419 mtype, 31767656SSherry.Moore@Sun.COM mmu_btop(dma_attr->dma_attr_addr_lo), 31772961Sdp78419 mmu_btop(dma_attr->dma_attr_addr_hi + 1), 31782961Sdp78419 &plw); 31797656SSherry.Moore@Sun.COM if (pp != NULL) { 31807656SSherry.Moore@Sun.COM check_dma(dma_attr, pp, 1); 31812961Sdp78419 return (pp); 31827656SSherry.Moore@Sun.COM } 31832961Sdp78419 } 31842961Sdp78419 31852961Sdp78419 bin = page_list_walk_next_bin(szc, bin, &plw); 31860Sstevel@tonic-gate } 31872961Sdp78419 3188414Skchow MTYPE_NEXT(mnode, mtype, flags); 3189414Skchow } while (mtype >= 0); 31900Sstevel@tonic-gate 31910Sstevel@tonic-gate /* failed to find a page in the freelist; try it in the cachelist */ 31920Sstevel@tonic-gate 31930Sstevel@tonic-gate /* reset mtype start for cachelist search */ 31940Sstevel@tonic-gate mtype = mtypestart; 31950Sstevel@tonic-gate ASSERT(mtype >= 0); 31960Sstevel@tonic-gate 31970Sstevel@tonic-gate /* start with the bin of matching color */ 31980Sstevel@tonic-gate bin = origbin; 31990Sstevel@tonic-gate 32000Sstevel@tonic-gate do { 32010Sstevel@tonic-gate for (i = 0; i <= page_colors; i++) { 32020Sstevel@tonic-gate if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL) 32030Sstevel@tonic-gate goto nextcachebin; 32040Sstevel@tonic-gate pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 32050Sstevel@tonic-gate mutex_enter(pcm); 32060Sstevel@tonic-gate pp = PAGE_CACHELISTS(mnode, bin, mtype); 32070Sstevel@tonic-gate first_pp = pp; 32080Sstevel@tonic-gate while (pp != NULL) { 32090Sstevel@tonic-gate if (page_trylock(pp, SE_EXCL) == 0) { 32100Sstevel@tonic-gate pp = pp->p_next; 32110Sstevel@tonic-gate if (pp == first_pp) 32129244SSherry.Moore@Sun.COM pp = NULL; 32130Sstevel@tonic-gate continue; 32140Sstevel@tonic-gate } 32150Sstevel@tonic-gate ASSERT(pp->p_vnode); 32160Sstevel@tonic-gate ASSERT(PP_ISAGED(pp) == 0); 
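				/*
				 * Unlike the freelist scan above, pages on
				 * the cachelist still have a vnode identity
				 * and are not aged; the DMA range check that
				 * follows is the same.
				 */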
32170Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 32180Sstevel@tonic-gate ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 32190Sstevel@tonic-gate 32200Sstevel@tonic-gate /* check if page within DMA attributes */ 32210Sstevel@tonic-gate 32223446Smrj pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 32230Sstevel@tonic-gate if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 32240Sstevel@tonic-gate (pgaddr + MMU_PAGESIZE - 1 <= 32250Sstevel@tonic-gate dma_attr->dma_attr_addr_hi)) { 32260Sstevel@tonic-gate break; 32270Sstevel@tonic-gate } 32280Sstevel@tonic-gate 32290Sstevel@tonic-gate /* continue looking */ 32300Sstevel@tonic-gate page_unlock(pp); 32310Sstevel@tonic-gate pp = pp->p_next; 32320Sstevel@tonic-gate if (pp == first_pp) 32330Sstevel@tonic-gate pp = NULL; 32340Sstevel@tonic-gate } 32350Sstevel@tonic-gate 32360Sstevel@tonic-gate if (pp != NULL) { 32370Sstevel@tonic-gate ASSERT(mtype == PP_2_MTYPE(pp)); 32380Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 32390Sstevel@tonic-gate 32400Sstevel@tonic-gate /* found a page with specified DMA attributes */ 32410Sstevel@tonic-gate page_sub(&PAGE_CACHELISTS(mnode, bin, 32420Sstevel@tonic-gate mtype), pp); 3243414Skchow page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 32440Sstevel@tonic-gate 32450Sstevel@tonic-gate mutex_exit(pcm); 32460Sstevel@tonic-gate ASSERT(pp->p_vnode); 32470Sstevel@tonic-gate ASSERT(PP_ISAGED(pp) == 0); 32480Sstevel@tonic-gate check_dma(dma_attr, pp, 1); 32490Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocok); 32500Sstevel@tonic-gate return (pp); 32510Sstevel@tonic-gate } 32520Sstevel@tonic-gate mutex_exit(pcm); 32530Sstevel@tonic-gate nextcachebin: 32540Sstevel@tonic-gate bin += (i == 0) ? BIN_STEP : 1; 32550Sstevel@tonic-gate bin &= page_colors_mask; 32560Sstevel@tonic-gate } 3257414Skchow MTYPE_NEXT(mnode, mtype, flags); 3258414Skchow } while (mtype >= 0); 32590Sstevel@tonic-gate 32600Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocfailed); 32610Sstevel@tonic-gate return (NULL); 32620Sstevel@tonic-gate } 32630Sstevel@tonic-gate 32640Sstevel@tonic-gate /* 32650Sstevel@tonic-gate * This function is similar to page_get_freelist()/page_get_cachelist() 32660Sstevel@tonic-gate * but it searches both the lists to find a page with the specified 32670Sstevel@tonic-gate * color (or no color) and DMA attributes. The search is done in the 32680Sstevel@tonic-gate * freelist first and then in the cache list within the highest memory 32690Sstevel@tonic-gate * range (based on DMA attributes) before searching in the lower 32700Sstevel@tonic-gate * memory ranges. 32710Sstevel@tonic-gate * 32720Sstevel@tonic-gate * Note: This function is called only by page_create_io(). 
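 *
 * The [dma_attr_addr_lo, dma_attr_addr_hi] range is translated into a
 * range of memory types; allocation then proceeds from the highest
 * usable memory type downward, using the regular freelist/cachelist
 * routines when the request spans whole mtype ranges and falling back
 * to page_get_mnode_anylist() otherwise.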
32730Sstevel@tonic-gate */ 32740Sstevel@tonic-gate /*ARGSUSED*/ 32755084Sjohnlev static page_t * 32760Sstevel@tonic-gate page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, 32770Sstevel@tonic-gate size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp) 32780Sstevel@tonic-gate { 32790Sstevel@tonic-gate uint_t bin; 32800Sstevel@tonic-gate int mtype; 32810Sstevel@tonic-gate page_t *pp; 32820Sstevel@tonic-gate int n; 32830Sstevel@tonic-gate int m; 32840Sstevel@tonic-gate int szc; 32850Sstevel@tonic-gate int fullrange; 32860Sstevel@tonic-gate int mnode; 32870Sstevel@tonic-gate int local_failed_stat = 0; 32880Sstevel@tonic-gate lgrp_mnode_cookie_t lgrp_cookie; 32890Sstevel@tonic-gate 32900Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_alloc); 32910Sstevel@tonic-gate 32920Sstevel@tonic-gate /* only base pagesize currently supported */ 32930Sstevel@tonic-gate if (size != MMU_PAGESIZE) 32940Sstevel@tonic-gate return (NULL); 32950Sstevel@tonic-gate 32960Sstevel@tonic-gate /* 32970Sstevel@tonic-gate * If we're passed a specific lgroup, we use it. Otherwise, 32980Sstevel@tonic-gate * assume first-touch placement is desired. 32990Sstevel@tonic-gate */ 33000Sstevel@tonic-gate if (!LGRP_EXISTS(lgrp)) 33010Sstevel@tonic-gate lgrp = lgrp_home_lgrp(); 33020Sstevel@tonic-gate 33030Sstevel@tonic-gate /* LINTED */ 33042961Sdp78419 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 33050Sstevel@tonic-gate 33060Sstevel@tonic-gate /* 33070Sstevel@tonic-gate * Only hold one freelist or cachelist lock at a time, that way we 33080Sstevel@tonic-gate * can start anywhere and not have to worry about lock 33090Sstevel@tonic-gate * ordering. 33100Sstevel@tonic-gate */ 33110Sstevel@tonic-gate if (dma_attr == NULL) { 3312*12004Sjiang.liu@intel.com n = mtype16m; 3313*12004Sjiang.liu@intel.com m = mtypetop; 33140Sstevel@tonic-gate fullrange = 1; 33150Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_nulldmaattr); 33160Sstevel@tonic-gate } else { 33170Sstevel@tonic-gate pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo); 33180Sstevel@tonic-gate pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi); 33190Sstevel@tonic-gate 33200Sstevel@tonic-gate /* 33210Sstevel@tonic-gate * We can guarantee alignment only for page boundary. 33220Sstevel@tonic-gate */ 33230Sstevel@tonic-gate if (dma_attr->dma_attr_align > MMU_PAGESIZE) 33240Sstevel@tonic-gate return (NULL); 33250Sstevel@tonic-gate 3326*12004Sjiang.liu@intel.com /* Sanity check the dma_attr */ 3327*12004Sjiang.liu@intel.com if (pfnlo > pfnhi) 3328*12004Sjiang.liu@intel.com return (NULL); 3329*12004Sjiang.liu@intel.com 33300Sstevel@tonic-gate n = pfn_2_mtype(pfnlo); 33310Sstevel@tonic-gate m = pfn_2_mtype(pfnhi); 33320Sstevel@tonic-gate 33330Sstevel@tonic-gate fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) && 33340Sstevel@tonic-gate (pfnhi >= mnoderanges[m].mnr_pfnhi)); 33350Sstevel@tonic-gate } 33360Sstevel@tonic-gate VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange); 33370Sstevel@tonic-gate 33380Sstevel@tonic-gate szc = 0; 33390Sstevel@tonic-gate 3340*12004Sjiang.liu@intel.com /* cylcing thru mtype handled by RANGE0 if n == mtype16m */ 3341*12004Sjiang.liu@intel.com if (n == mtype16m) { 33420Sstevel@tonic-gate flags |= PGI_MT_RANGE0; 33430Sstevel@tonic-gate n = m; 33440Sstevel@tonic-gate } 33450Sstevel@tonic-gate 33460Sstevel@tonic-gate /* 33470Sstevel@tonic-gate * Try local memory node first, but try remote if we can't 33480Sstevel@tonic-gate * get a page of the right color. 
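 * The lgroup cookie initialized below walks memory nodes in locality
 * order (LGRP_SRCH_HIER), so the home node is offered the request
 * before progressively more remote nodes.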
33490Sstevel@tonic-gate */ 33500Sstevel@tonic-gate LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER); 33510Sstevel@tonic-gate while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 33520Sstevel@tonic-gate /* 33530Sstevel@tonic-gate * allocate pages from high pfn to low. 33540Sstevel@tonic-gate */ 3355*12004Sjiang.liu@intel.com mtype = m; 3356*12004Sjiang.liu@intel.com do { 33570Sstevel@tonic-gate if (fullrange != 0) { 33580Sstevel@tonic-gate pp = page_get_mnode_freelist(mnode, 33590Sstevel@tonic-gate bin, mtype, szc, flags); 33600Sstevel@tonic-gate if (pp == NULL) { 33610Sstevel@tonic-gate pp = page_get_mnode_cachelist( 33625084Sjohnlev bin, flags, mnode, mtype); 33630Sstevel@tonic-gate } 33640Sstevel@tonic-gate } else { 33650Sstevel@tonic-gate pp = page_get_mnode_anylist(bin, szc, 33660Sstevel@tonic-gate flags, mnode, mtype, dma_attr); 33670Sstevel@tonic-gate } 33680Sstevel@tonic-gate if (pp != NULL) { 33690Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_allocok); 33700Sstevel@tonic-gate check_dma(dma_attr, pp, 1); 33710Sstevel@tonic-gate return (pp); 33720Sstevel@tonic-gate } 3373*12004Sjiang.liu@intel.com } while (mtype != n && 3374*12004Sjiang.liu@intel.com (mtype = mnoderanges[mtype].mnr_next) != -1); 33750Sstevel@tonic-gate if (!local_failed_stat) { 33760Sstevel@tonic-gate lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 33770Sstevel@tonic-gate local_failed_stat = 1; 33780Sstevel@tonic-gate } 33790Sstevel@tonic-gate } 33800Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_allocfailed); 33810Sstevel@tonic-gate 33820Sstevel@tonic-gate return (NULL); 33830Sstevel@tonic-gate } 33840Sstevel@tonic-gate 33850Sstevel@tonic-gate /* 33860Sstevel@tonic-gate * page_create_io() 33870Sstevel@tonic-gate * 33880Sstevel@tonic-gate * This function is a copy of page_create_va() with an additional 33890Sstevel@tonic-gate * argument 'mattr' that specifies DMA memory requirements to 33900Sstevel@tonic-gate * the page list functions. This function is used by the segkmem 33910Sstevel@tonic-gate * allocator so it is only to create new pages (i.e PG_EXCL is 33920Sstevel@tonic-gate * set). 33930Sstevel@tonic-gate * 33940Sstevel@tonic-gate * Note: This interface is currently used by x86 PSM only and is 33950Sstevel@tonic-gate * not fully specified so the commitment level is only for 33960Sstevel@tonic-gate * private interface specific to x86. This interface uses PSM 33970Sstevel@tonic-gate * specific page_get_anylist() interface. 
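 *
 * Illustrative sketch (not from the original source): the kind of request
 * this routine serves is a PSM caller that needs wired pages below 4G for
 * DMA.  The attribute values and variables here are hypothetical; only
 * the field names and the page_create_io() signature come from the code.
 *
 *	ddi_dma_attr_t attr;
 *
 *	bzero(&attr, sizeof (attr));
 *	attr.dma_attr_version = DMA_ATTR_V0;
 *	attr.dma_attr_addr_lo = 0;
 *	attr.dma_attr_addr_hi = 0xffffffffULL;		(stay below 4G)
 *	attr.dma_attr_align = MMU_PAGESIZE;
 *	attr.dma_attr_seg = 0xffffffffULL;
 *	attr.dma_attr_sgllen = 1;
 *	pp = page_create_io(&kvp, off, len, PG_EXCL | PG_WAIT,
 *	    &kas, vaddr, &attr);
 *
 * dma_attr_addr_lo/hi and dma_attr_align are checked in
 * page_get_anylist() above.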
33980Sstevel@tonic-gate */ 33990Sstevel@tonic-gate 34000Sstevel@tonic-gate #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 34010Sstevel@tonic-gate for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 34020Sstevel@tonic-gate if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 34030Sstevel@tonic-gate break; \ 34040Sstevel@tonic-gate } \ 34050Sstevel@tonic-gate } 34060Sstevel@tonic-gate 34070Sstevel@tonic-gate 34080Sstevel@tonic-gate page_t * 34090Sstevel@tonic-gate page_create_io( 34100Sstevel@tonic-gate struct vnode *vp, 34110Sstevel@tonic-gate u_offset_t off, 34120Sstevel@tonic-gate uint_t bytes, 34130Sstevel@tonic-gate uint_t flags, 34140Sstevel@tonic-gate struct as *as, 34150Sstevel@tonic-gate caddr_t vaddr, 34160Sstevel@tonic-gate ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 34170Sstevel@tonic-gate { 34180Sstevel@tonic-gate page_t *plist = NULL; 34190Sstevel@tonic-gate uint_t plist_len = 0; 34200Sstevel@tonic-gate pgcnt_t npages; 34210Sstevel@tonic-gate page_t *npp = NULL; 34220Sstevel@tonic-gate uint_t pages_req; 34230Sstevel@tonic-gate page_t *pp; 34240Sstevel@tonic-gate kmutex_t *phm = NULL; 34250Sstevel@tonic-gate uint_t index; 34260Sstevel@tonic-gate 34270Sstevel@tonic-gate TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 34285084Sjohnlev "page_create_start:vp %p off %llx bytes %u flags %x", 34295084Sjohnlev vp, off, bytes, flags); 34300Sstevel@tonic-gate 34310Sstevel@tonic-gate ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 34320Sstevel@tonic-gate 34330Sstevel@tonic-gate pages_req = npages = mmu_btopr(bytes); 34340Sstevel@tonic-gate 34350Sstevel@tonic-gate /* 34360Sstevel@tonic-gate * Do the freemem and pcf accounting. 34370Sstevel@tonic-gate */ 34380Sstevel@tonic-gate if (!page_create_wait(npages, flags)) { 34390Sstevel@tonic-gate return (NULL); 34400Sstevel@tonic-gate } 34410Sstevel@tonic-gate 34420Sstevel@tonic-gate TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 34435084Sjohnlev "page_create_success:vp %p off %llx", vp, off); 34440Sstevel@tonic-gate 34450Sstevel@tonic-gate /* 34460Sstevel@tonic-gate * If satisfying this request has left us with too little 34470Sstevel@tonic-gate * memory, start the wheels turning to get some back. The 34480Sstevel@tonic-gate * first clause of the test prevents waking up the pageout 34490Sstevel@tonic-gate * daemon in situations where it would decide that there's 34500Sstevel@tonic-gate * nothing to do. 
34510Sstevel@tonic-gate */ 34520Sstevel@tonic-gate if (nscan < desscan && freemem < minfree) { 34530Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 34545084Sjohnlev "pageout_cv_signal:freemem %ld", freemem); 34550Sstevel@tonic-gate cv_signal(&proc_pageout->p_cv); 34560Sstevel@tonic-gate } 34570Sstevel@tonic-gate 34580Sstevel@tonic-gate if (flags & PG_PHYSCONTIG) { 34590Sstevel@tonic-gate 34600Sstevel@tonic-gate plist = page_get_contigpage(&npages, mattr, 1); 34610Sstevel@tonic-gate if (plist == NULL) { 34620Sstevel@tonic-gate page_create_putback(npages); 34630Sstevel@tonic-gate return (NULL); 34640Sstevel@tonic-gate } 34650Sstevel@tonic-gate 34660Sstevel@tonic-gate pp = plist; 34670Sstevel@tonic-gate 34680Sstevel@tonic-gate do { 34690Sstevel@tonic-gate if (!page_hashin(pp, vp, off, NULL)) { 34700Sstevel@tonic-gate panic("pg_creat_io: hashin failed %p %p %llx", 34710Sstevel@tonic-gate (void *)pp, (void *)vp, off); 34720Sstevel@tonic-gate } 34730Sstevel@tonic-gate VM_STAT_ADD(page_create_new); 34740Sstevel@tonic-gate off += MMU_PAGESIZE; 34750Sstevel@tonic-gate PP_CLRFREE(pp); 34760Sstevel@tonic-gate PP_CLRAGED(pp); 34770Sstevel@tonic-gate page_set_props(pp, P_REF); 34780Sstevel@tonic-gate pp = pp->p_next; 34790Sstevel@tonic-gate } while (pp != plist); 34800Sstevel@tonic-gate 34810Sstevel@tonic-gate if (!npages) { 34820Sstevel@tonic-gate check_dma(mattr, plist, pages_req); 34830Sstevel@tonic-gate return (plist); 34840Sstevel@tonic-gate } else { 34850Sstevel@tonic-gate vaddr += (pages_req - npages) << MMU_PAGESHIFT; 34860Sstevel@tonic-gate } 34870Sstevel@tonic-gate 34880Sstevel@tonic-gate /* 34890Sstevel@tonic-gate * fall-thru: 34900Sstevel@tonic-gate * 34910Sstevel@tonic-gate * page_get_contigpage returns when npages <= sgllen. 34920Sstevel@tonic-gate * Grab the rest of the non-contig pages below from anylist. 34930Sstevel@tonic-gate */ 34940Sstevel@tonic-gate } 34950Sstevel@tonic-gate 34960Sstevel@tonic-gate /* 34970Sstevel@tonic-gate * Loop around collecting the requested number of pages. 34980Sstevel@tonic-gate * Most of the time, we have to `create' a new page. With 34990Sstevel@tonic-gate * this in mind, pull the page off the free list before 35000Sstevel@tonic-gate * getting the hash lock. This will minimize the hash 35010Sstevel@tonic-gate * lock hold time, nesting, and the like. If it turns 35020Sstevel@tonic-gate * out we don't need the page, we put it back at the end. 35030Sstevel@tonic-gate */ 35040Sstevel@tonic-gate while (npages--) { 35050Sstevel@tonic-gate phm = NULL; 35060Sstevel@tonic-gate 35070Sstevel@tonic-gate index = PAGE_HASH_FUNC(vp, off); 35080Sstevel@tonic-gate top: 35090Sstevel@tonic-gate ASSERT(phm == NULL); 35100Sstevel@tonic-gate ASSERT(index == PAGE_HASH_FUNC(vp, off)); 35110Sstevel@tonic-gate ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 35120Sstevel@tonic-gate 35130Sstevel@tonic-gate if (npp == NULL) { 35140Sstevel@tonic-gate /* 35150Sstevel@tonic-gate * Try to get the page of any color either from 35160Sstevel@tonic-gate * the freelist or from the cache list. 35170Sstevel@tonic-gate */ 35180Sstevel@tonic-gate npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, 35190Sstevel@tonic-gate flags & ~PG_MATCH_COLOR, mattr, NULL); 35200Sstevel@tonic-gate if (npp == NULL) { 35210Sstevel@tonic-gate if (mattr == NULL) { 35220Sstevel@tonic-gate /* 35230Sstevel@tonic-gate * Not looking for a special page; 35240Sstevel@tonic-gate * panic! 
35250Sstevel@tonic-gate */ 35260Sstevel@tonic-gate panic("no page found %d", (int)npages); 35270Sstevel@tonic-gate } 35280Sstevel@tonic-gate /* 35290Sstevel@tonic-gate * No page found! This can happen 35300Sstevel@tonic-gate * if we are looking for a page 35310Sstevel@tonic-gate * within a specific memory range 35320Sstevel@tonic-gate * for DMA purposes. If PG_WAIT is 35330Sstevel@tonic-gate * specified then we wait for a 35340Sstevel@tonic-gate * while and then try again. The 35350Sstevel@tonic-gate * wait could be forever if we 35360Sstevel@tonic-gate * don't get the page(s) we need. 35370Sstevel@tonic-gate * 35380Sstevel@tonic-gate * Note: XXX We really need a mechanism 35390Sstevel@tonic-gate * to wait for pages in the desired 35400Sstevel@tonic-gate * range. For now, we wait for any 35410Sstevel@tonic-gate * pages and see if we can use it. 35420Sstevel@tonic-gate */ 35430Sstevel@tonic-gate 35440Sstevel@tonic-gate if ((mattr != NULL) && (flags & PG_WAIT)) { 35450Sstevel@tonic-gate delay(10); 35460Sstevel@tonic-gate goto top; 35470Sstevel@tonic-gate } 35480Sstevel@tonic-gate goto fail; /* undo accounting stuff */ 35490Sstevel@tonic-gate } 35500Sstevel@tonic-gate 35510Sstevel@tonic-gate if (PP_ISAGED(npp) == 0) { 35520Sstevel@tonic-gate /* 35530Sstevel@tonic-gate * Since this page came from the 35540Sstevel@tonic-gate * cachelist, we must destroy the 35550Sstevel@tonic-gate * old vnode association. 35560Sstevel@tonic-gate */ 35570Sstevel@tonic-gate page_hashout(npp, (kmutex_t *)NULL); 35580Sstevel@tonic-gate } 35590Sstevel@tonic-gate } 35600Sstevel@tonic-gate 35610Sstevel@tonic-gate /* 35620Sstevel@tonic-gate * We own this page! 35630Sstevel@tonic-gate */ 35640Sstevel@tonic-gate ASSERT(PAGE_EXCL(npp)); 35650Sstevel@tonic-gate ASSERT(npp->p_vnode == NULL); 35660Sstevel@tonic-gate ASSERT(!hat_page_is_mapped(npp)); 35670Sstevel@tonic-gate PP_CLRFREE(npp); 35680Sstevel@tonic-gate PP_CLRAGED(npp); 35690Sstevel@tonic-gate 35700Sstevel@tonic-gate /* 35710Sstevel@tonic-gate * Here we have a page in our hot little mits and are 35720Sstevel@tonic-gate * just waiting to stuff it on the appropriate lists. 35730Sstevel@tonic-gate * Get the mutex and check to see if it really does 35740Sstevel@tonic-gate * not exist. 35750Sstevel@tonic-gate */ 35760Sstevel@tonic-gate phm = PAGE_HASH_MUTEX(index); 35770Sstevel@tonic-gate mutex_enter(phm); 35780Sstevel@tonic-gate PAGE_HASH_SEARCH(index, pp, vp, off); 35790Sstevel@tonic-gate if (pp == NULL) { 35800Sstevel@tonic-gate VM_STAT_ADD(page_create_new); 35810Sstevel@tonic-gate pp = npp; 35820Sstevel@tonic-gate npp = NULL; 35830Sstevel@tonic-gate if (!page_hashin(pp, vp, off, phm)) { 35840Sstevel@tonic-gate /* 35850Sstevel@tonic-gate * Since we hold the page hash mutex and 35860Sstevel@tonic-gate * just searched for this page, page_hashin 35870Sstevel@tonic-gate * had better not fail. If it does, that 35880Sstevel@tonic-gate * means somethread did not follow the 35890Sstevel@tonic-gate * page hash mutex rules. Panic now and 35900Sstevel@tonic-gate * get it over with. As usual, go down 35910Sstevel@tonic-gate * holding all the locks. 
35920Sstevel@tonic-gate */ 35930Sstevel@tonic-gate ASSERT(MUTEX_HELD(phm)); 35940Sstevel@tonic-gate panic("page_create: hashin fail %p %p %llx %p", 35950Sstevel@tonic-gate (void *)pp, (void *)vp, off, (void *)phm); 35960Sstevel@tonic-gate 35970Sstevel@tonic-gate } 35980Sstevel@tonic-gate ASSERT(MUTEX_HELD(phm)); 35990Sstevel@tonic-gate mutex_exit(phm); 36000Sstevel@tonic-gate phm = NULL; 36010Sstevel@tonic-gate 36020Sstevel@tonic-gate /* 36030Sstevel@tonic-gate * Hat layer locking need not be done to set 36040Sstevel@tonic-gate * the following bits since the page is not hashed 36050Sstevel@tonic-gate * and was on the free list (i.e., had no mappings). 36060Sstevel@tonic-gate * 36070Sstevel@tonic-gate * Set the reference bit to protect 36080Sstevel@tonic-gate * against immediate pageout 36090Sstevel@tonic-gate * 36100Sstevel@tonic-gate * XXXmh modify freelist code to set reference 36110Sstevel@tonic-gate * bit so we don't have to do it here. 36120Sstevel@tonic-gate */ 36130Sstevel@tonic-gate page_set_props(pp, P_REF); 36140Sstevel@tonic-gate } else { 36150Sstevel@tonic-gate ASSERT(MUTEX_HELD(phm)); 36160Sstevel@tonic-gate mutex_exit(phm); 36170Sstevel@tonic-gate phm = NULL; 36180Sstevel@tonic-gate /* 36190Sstevel@tonic-gate * NOTE: This should not happen for pages associated 36200Sstevel@tonic-gate * with kernel vnode 'kvp'. 36210Sstevel@tonic-gate */ 36220Sstevel@tonic-gate /* XX64 - to debug why this happens! */ 36233290Sjohansen ASSERT(!VN_ISKAS(vp)); 36243290Sjohansen if (VN_ISKAS(vp)) 36250Sstevel@tonic-gate cmn_err(CE_NOTE, 36260Sstevel@tonic-gate "page_create: page not expected " 36270Sstevel@tonic-gate "in hash list for kernel vnode - pp 0x%p", 36280Sstevel@tonic-gate (void *)pp); 36290Sstevel@tonic-gate VM_STAT_ADD(page_create_exists); 36300Sstevel@tonic-gate goto fail; 36310Sstevel@tonic-gate } 36320Sstevel@tonic-gate 36330Sstevel@tonic-gate /* 36340Sstevel@tonic-gate * Got a page! It is locked. Acquire the i/o 36350Sstevel@tonic-gate * lock since we are going to use the p_next and 36360Sstevel@tonic-gate * p_prev fields to link the requested pages together. 36370Sstevel@tonic-gate */ 36380Sstevel@tonic-gate page_io_lock(pp); 36390Sstevel@tonic-gate page_add(&plist, pp); 36400Sstevel@tonic-gate plist = plist->p_next; 36410Sstevel@tonic-gate off += MMU_PAGESIZE; 36420Sstevel@tonic-gate vaddr += MMU_PAGESIZE; 36430Sstevel@tonic-gate } 36440Sstevel@tonic-gate 36450Sstevel@tonic-gate check_dma(mattr, plist, pages_req); 36460Sstevel@tonic-gate return (plist); 36470Sstevel@tonic-gate 36480Sstevel@tonic-gate fail: 36490Sstevel@tonic-gate if (npp != NULL) { 36500Sstevel@tonic-gate /* 36510Sstevel@tonic-gate * Did not need this page after all. 36520Sstevel@tonic-gate * Put it back on the free list. 36530Sstevel@tonic-gate */ 36540Sstevel@tonic-gate VM_STAT_ADD(page_create_putbacks); 36550Sstevel@tonic-gate PP_SETFREE(npp); 36560Sstevel@tonic-gate PP_SETAGED(npp); 36570Sstevel@tonic-gate npp->p_offset = (u_offset_t)-1; 36580Sstevel@tonic-gate page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 36590Sstevel@tonic-gate page_unlock(npp); 36600Sstevel@tonic-gate } 36610Sstevel@tonic-gate 36620Sstevel@tonic-gate /* 36630Sstevel@tonic-gate * Give up the pages we already got. 
36640Sstevel@tonic-gate */ 36650Sstevel@tonic-gate while (plist != NULL) { 36660Sstevel@tonic-gate pp = plist; 36670Sstevel@tonic-gate page_sub(&plist, pp); 36680Sstevel@tonic-gate page_io_unlock(pp); 36690Sstevel@tonic-gate plist_len++; 36700Sstevel@tonic-gate /*LINTED: constant in conditional ctx*/ 36710Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 36720Sstevel@tonic-gate } 36730Sstevel@tonic-gate 36740Sstevel@tonic-gate /* 36750Sstevel@tonic-gate * VN_DISPOSE does freemem accounting for the pages in plist 36760Sstevel@tonic-gate * by calling page_free. So, we need to undo the pcf accounting 36770Sstevel@tonic-gate * for only the remaining pages. 36780Sstevel@tonic-gate */ 36790Sstevel@tonic-gate VM_STAT_ADD(page_create_putbacks); 36800Sstevel@tonic-gate page_create_putback(pages_req - plist_len); 36810Sstevel@tonic-gate 36820Sstevel@tonic-gate return (NULL); 36830Sstevel@tonic-gate } 36845084Sjohnlev #endif /* !__xpv */ 36850Sstevel@tonic-gate 36860Sstevel@tonic-gate 36870Sstevel@tonic-gate /* 36880Sstevel@tonic-gate * Copy the data from the physical page represented by "frompp" to 36890Sstevel@tonic-gate * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 36900Sstevel@tonic-gate * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt 36910Sstevel@tonic-gate * level and no one sleeps with an active mapping there. 36920Sstevel@tonic-gate * 36930Sstevel@tonic-gate * Note that the ref/mod bits in the page_t's are not affected by 36940Sstevel@tonic-gate * this operation, hence it is up to the caller to update them appropriately. 36950Sstevel@tonic-gate */ 36963253Smec int 36970Sstevel@tonic-gate ppcopy(page_t *frompp, page_t *topp) 36980Sstevel@tonic-gate { 36990Sstevel@tonic-gate caddr_t pp_addr1; 37000Sstevel@tonic-gate caddr_t pp_addr2; 37013446Smrj hat_mempte_t pte1; 37023446Smrj hat_mempte_t pte2; 37030Sstevel@tonic-gate kmutex_t *ppaddr_mutex; 37043253Smec label_t ljb; 37053253Smec int ret = 1; 37060Sstevel@tonic-gate 37070Sstevel@tonic-gate ASSERT_STACK_ALIGNED(); 37080Sstevel@tonic-gate ASSERT(PAGE_LOCKED(frompp)); 37090Sstevel@tonic-gate ASSERT(PAGE_LOCKED(topp)); 37100Sstevel@tonic-gate 37110Sstevel@tonic-gate if (kpm_enable) { 37120Sstevel@tonic-gate pp_addr1 = hat_kpm_page2va(frompp, 0); 37130Sstevel@tonic-gate pp_addr2 = hat_kpm_page2va(topp, 0); 37140Sstevel@tonic-gate kpreempt_disable(); 37150Sstevel@tonic-gate } else { 37160Sstevel@tonic-gate /* 37170Sstevel@tonic-gate * disable pre-emption so that CPU can't change 37180Sstevel@tonic-gate */ 37190Sstevel@tonic-gate kpreempt_disable(); 37200Sstevel@tonic-gate 37210Sstevel@tonic-gate pp_addr1 = CPU->cpu_caddr1; 37220Sstevel@tonic-gate pp_addr2 = CPU->cpu_caddr2; 37233446Smrj pte1 = CPU->cpu_caddr1pte; 37243446Smrj pte2 = CPU->cpu_caddr2pte; 37250Sstevel@tonic-gate 37260Sstevel@tonic-gate ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 37270Sstevel@tonic-gate mutex_enter(ppaddr_mutex); 37280Sstevel@tonic-gate 37290Sstevel@tonic-gate hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1, 37300Sstevel@tonic-gate PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST); 37310Sstevel@tonic-gate hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2, 37320Sstevel@tonic-gate PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 37330Sstevel@tonic-gate HAT_LOAD_NOCONSIST); 37340Sstevel@tonic-gate } 37350Sstevel@tonic-gate 37363253Smec if (on_fault(&ljb)) { 37373253Smec ret = 0; 37383253Smec goto faulted; 37393253Smec } 37400Sstevel@tonic-gate if (use_sse_pagecopy) 37415084Sjohnlev #ifdef __xpv 37425084Sjohnlev 
page_copy_no_xmm(pp_addr2, pp_addr1); 37435084Sjohnlev #else 37440Sstevel@tonic-gate hwblkpagecopy(pp_addr1, pp_addr2); 37455084Sjohnlev #endif 37460Sstevel@tonic-gate else 37470Sstevel@tonic-gate bcopy(pp_addr1, pp_addr2, PAGESIZE); 37480Sstevel@tonic-gate 37493253Smec no_fault(); 37503253Smec faulted: 37513446Smrj if (!kpm_enable) { 37525084Sjohnlev #ifdef __xpv 37535084Sjohnlev /* 37545217Sjosephb * We can't leave unused mappings laying about under the 37555217Sjosephb * hypervisor, so blow them away. 37565084Sjohnlev */ 37575217Sjosephb if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0, 37585217Sjosephb UVMF_INVLPG | UVMF_LOCAL) < 0) 37595217Sjosephb panic("HYPERVISOR_update_va_mapping() failed"); 37605084Sjohnlev if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 37615084Sjohnlev UVMF_INVLPG | UVMF_LOCAL) < 0) 37625084Sjohnlev panic("HYPERVISOR_update_va_mapping() failed"); 37635084Sjohnlev #endif 37640Sstevel@tonic-gate mutex_exit(ppaddr_mutex); 37653446Smrj } 37660Sstevel@tonic-gate kpreempt_enable(); 37673253Smec return (ret); 37680Sstevel@tonic-gate } 37690Sstevel@tonic-gate 37705262Srscott void 37715262Srscott pagezero(page_t *pp, uint_t off, uint_t len) 37725262Srscott { 37735262Srscott ASSERT(PAGE_LOCKED(pp)); 37745262Srscott pfnzero(page_pptonum(pp), off, len); 37755262Srscott } 37765262Srscott 37770Sstevel@tonic-gate /* 37785262Srscott * Zero the physical page from off to off + len given by pfn 37790Sstevel@tonic-gate * without changing the reference and modified bits of page. 37800Sstevel@tonic-gate * 37810Sstevel@tonic-gate * We use this using CPU private page address #2, see ppcopy() for more info. 37825262Srscott * pfnzero() must not be called at interrupt level. 37830Sstevel@tonic-gate */ 37840Sstevel@tonic-gate void 37855262Srscott pfnzero(pfn_t pfn, uint_t off, uint_t len) 37860Sstevel@tonic-gate { 37870Sstevel@tonic-gate caddr_t pp_addr2; 37883446Smrj hat_mempte_t pte2; 37895262Srscott kmutex_t *ppaddr_mutex = NULL; 37900Sstevel@tonic-gate 37910Sstevel@tonic-gate ASSERT_STACK_ALIGNED(); 37920Sstevel@tonic-gate ASSERT(len <= MMU_PAGESIZE); 37930Sstevel@tonic-gate ASSERT(off <= MMU_PAGESIZE); 37940Sstevel@tonic-gate ASSERT(off + len <= MMU_PAGESIZE); 37955262Srscott 37965262Srscott if (kpm_enable && !pfn_is_foreign(pfn)) { 37975262Srscott pp_addr2 = hat_kpm_pfn2va(pfn); 37980Sstevel@tonic-gate kpreempt_disable(); 37990Sstevel@tonic-gate } else { 38000Sstevel@tonic-gate kpreempt_disable(); 38010Sstevel@tonic-gate 38020Sstevel@tonic-gate pp_addr2 = CPU->cpu_caddr2; 38033446Smrj pte2 = CPU->cpu_caddr2pte; 38040Sstevel@tonic-gate 38050Sstevel@tonic-gate ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 38060Sstevel@tonic-gate mutex_enter(ppaddr_mutex); 38070Sstevel@tonic-gate 38085262Srscott hat_mempte_remap(pfn, pp_addr2, pte2, 38090Sstevel@tonic-gate PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 38100Sstevel@tonic-gate HAT_LOAD_NOCONSIST); 38110Sstevel@tonic-gate } 38120Sstevel@tonic-gate 38133446Smrj if (use_sse_pagezero) { 38145084Sjohnlev #ifdef __xpv 38155084Sjohnlev uint_t rem; 38165084Sjohnlev 38175084Sjohnlev /* 38185084Sjohnlev * zero a byte at a time until properly aligned for 38195084Sjohnlev * block_zero_no_xmm(). 38205084Sjohnlev */ 38215084Sjohnlev while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0) 38225084Sjohnlev pp_addr2[off++] = 0; 38235084Sjohnlev 38245084Sjohnlev /* 38255084Sjohnlev * Now use faster block_zero_no_xmm() for any range 38265084Sjohnlev * that is properly aligned and sized. 
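 * Leading bytes up to the block alignment and the trailing remainder
 * are cleared with plain byte stores; only the aligned middle portion
 * goes through block_zero_no_xmm().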
38275084Sjohnlev */ 38285084Sjohnlev rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN)); 38295084Sjohnlev len -= rem; 38305084Sjohnlev if (len != 0) { 38315084Sjohnlev block_zero_no_xmm(pp_addr2 + off, len); 38325084Sjohnlev off += len; 38335084Sjohnlev } 38345084Sjohnlev 38355084Sjohnlev /* 38365084Sjohnlev * zero remainder with byte stores. 38375084Sjohnlev */ 38385084Sjohnlev while (rem-- > 0) 38395084Sjohnlev pp_addr2[off++] = 0; 38405084Sjohnlev #else 38410Sstevel@tonic-gate hwblkclr(pp_addr2 + off, len); 38425084Sjohnlev #endif 38433446Smrj } else { 38440Sstevel@tonic-gate bzero(pp_addr2 + off, len); 38453446Smrj } 38460Sstevel@tonic-gate 38475262Srscott if (!kpm_enable || pfn_is_foreign(pfn)) { 38485084Sjohnlev #ifdef __xpv 38495262Srscott /* 38505262Srscott * On the hypervisor this page might get used for a page 38515262Srscott * table before any intervening change to this mapping, 38525262Srscott * so blow it away. 38535262Srscott */ 38545262Srscott if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 38555262Srscott UVMF_INVLPG) < 0) 38565262Srscott panic("HYPERVISOR_update_va_mapping() failed"); 38575084Sjohnlev #endif 38580Sstevel@tonic-gate mutex_exit(ppaddr_mutex); 38595262Srscott } 38605262Srscott 38610Sstevel@tonic-gate kpreempt_enable(); 38620Sstevel@tonic-gate } 38630Sstevel@tonic-gate 38640Sstevel@tonic-gate /* 38650Sstevel@tonic-gate * Platform-dependent page scrub call. 38660Sstevel@tonic-gate */ 38670Sstevel@tonic-gate void 38680Sstevel@tonic-gate pagescrub(page_t *pp, uint_t off, uint_t len) 38690Sstevel@tonic-gate { 38700Sstevel@tonic-gate /* 38710Sstevel@tonic-gate * For now, we rely on the fact that pagezero() will 38720Sstevel@tonic-gate * always clear UEs. 38730Sstevel@tonic-gate */ 38740Sstevel@tonic-gate pagezero(pp, off, len); 38750Sstevel@tonic-gate } 38760Sstevel@tonic-gate 38770Sstevel@tonic-gate /* 38780Sstevel@tonic-gate * set up two private addresses for use on a given CPU for use in ppcopy() 38790Sstevel@tonic-gate */ 38800Sstevel@tonic-gate void 38810Sstevel@tonic-gate setup_vaddr_for_ppcopy(struct cpu *cpup) 38820Sstevel@tonic-gate { 38830Sstevel@tonic-gate void *addr; 38843446Smrj hat_mempte_t pte_pa; 38850Sstevel@tonic-gate 38860Sstevel@tonic-gate addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 38873446Smrj pte_pa = hat_mempte_setup(addr); 38880Sstevel@tonic-gate cpup->cpu_caddr1 = addr; 38893446Smrj cpup->cpu_caddr1pte = pte_pa; 38900Sstevel@tonic-gate 38910Sstevel@tonic-gate addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 38923446Smrj pte_pa = hat_mempte_setup(addr); 38930Sstevel@tonic-gate cpup->cpu_caddr2 = addr; 38943446Smrj cpup->cpu_caddr2pte = pte_pa; 38950Sstevel@tonic-gate 38960Sstevel@tonic-gate mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 38970Sstevel@tonic-gate } 38980Sstevel@tonic-gate 38993446Smrj /* 39003446Smrj * Undo setup_vaddr_for_ppcopy 39013446Smrj */ 39023446Smrj void 39033446Smrj teardown_vaddr_for_ppcopy(struct cpu *cpup) 39043446Smrj { 39053446Smrj mutex_destroy(&cpup->cpu_ppaddr_mutex); 39063446Smrj 39073446Smrj hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte); 39083446Smrj cpup->cpu_caddr2pte = 0; 39093446Smrj vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1)); 39103446Smrj cpup->cpu_caddr2 = 0; 39113446Smrj 39123446Smrj hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte); 39133446Smrj cpup->cpu_caddr1pte = 0; 39143446Smrj vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1)); 39153446Smrj cpup->cpu_caddr1 = 0; 39163446Smrj } 39170Sstevel@tonic-gate 39180Sstevel@tonic-gate /* 
39190Sstevel@tonic-gate * Function for flushing D-cache when performing module relocations 39200Sstevel@tonic-gate * to an alternate mapping. Unnecessary on Intel / AMD platforms. 39210Sstevel@tonic-gate */ 39220Sstevel@tonic-gate void 39230Sstevel@tonic-gate dcache_flushall() 39240Sstevel@tonic-gate {} 39253177Sdp78419 39263177Sdp78419 size_t 39273177Sdp78419 exec_get_spslew(void) 39283177Sdp78419 { 39293177Sdp78419 return (0); 39303177Sdp78419 } 39313446Smrj 39323446Smrj /* 39333446Smrj * Allocate a memory page. The argument 'seed' can be any pseudo-random 39343446Smrj * number to vary where the pages come from. This is quite a hacked up 39353446Smrj * method -- it works for now, but really needs to be fixed up a bit. 39363446Smrj * 39373446Smrj * We currently use page_create_va() on the kvp with fake offsets, 39383446Smrj * segments and virt address. This is pretty bogus, but was copied from the 39393446Smrj * old hat_i86.c code. A better approach would be to specify either mnode 39403446Smrj * random or mnode local and takes a page from whatever color has the MOST 39413446Smrj * available - this would have a minimal impact on page coloring. 39423446Smrj */ 39433446Smrj page_t * 39449062SVikram.Hegde@Sun.COM page_get_physical(uintptr_t seed) 39453446Smrj { 39463446Smrj page_t *pp; 39479062SVikram.Hegde@Sun.COM u_offset_t offset; 39483446Smrj static struct seg tmpseg; 39493446Smrj static uintptr_t ctr = 0; 39503446Smrj 39513446Smrj /* 39523446Smrj * This code is gross, we really need a simpler page allocator. 39533446Smrj * 39549062SVikram.Hegde@Sun.COM * We need to assign an offset for the page to call page_create_va() 39553446Smrj * To avoid conflicts with other pages, we get creative with the offset. 39567589SVikram.Hegde@Sun.COM * For 32 bits, we need an offset > 4Gig 39577589SVikram.Hegde@Sun.COM * For 64 bits, need an offset somewhere in the VA hole. 39583446Smrj */ 39599062SVikram.Hegde@Sun.COM offset = seed; 39609062SVikram.Hegde@Sun.COM if (offset > kernelbase) 39619062SVikram.Hegde@Sun.COM offset -= kernelbase; 39629062SVikram.Hegde@Sun.COM offset <<= MMU_PAGESHIFT; 39639062SVikram.Hegde@Sun.COM #if defined(__amd64) 39649062SVikram.Hegde@Sun.COM offset += mmu.hole_start; /* something in VA hole */ 39659062SVikram.Hegde@Sun.COM #else 39669062SVikram.Hegde@Sun.COM offset += 1ULL << 40; /* something > 4 Gig */ 39679062SVikram.Hegde@Sun.COM #endif 39689062SVikram.Hegde@Sun.COM 39699062SVikram.Hegde@Sun.COM if (page_resv(1, KM_NOSLEEP) == 0) 39703446Smrj return (NULL); 39713446Smrj 39723446Smrj #ifdef DEBUG 39733446Smrj pp = page_exists(&kvp, offset); 39743446Smrj if (pp != NULL) 39757240Srh87107 panic("page already exists %p", (void *)pp); 39763446Smrj #endif 39773446Smrj 39785084Sjohnlev pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL, 39793446Smrj &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE)); /* changing VA usage */ 39807589SVikram.Hegde@Sun.COM if (pp != NULL) { 39817589SVikram.Hegde@Sun.COM page_io_unlock(pp); 39827589SVikram.Hegde@Sun.COM page_hashout(pp, NULL); 39838198SVikram.Hegde@Sun.COM page_downgrade(pp); 39847589SVikram.Hegde@Sun.COM } 39853446Smrj return (pp); 39863446Smrj } 3987
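/*
 * Illustrative sketch, not part of the original source: the extent-order
 * arithmetic used by page_swap_with_hypervisor() above, pulled out as a
 * stand-alone helper.  A request for minctg physically contiguous pages
 * is rounded up to a power-of-two extent of 2^order pages; the surplus
 * ("extra") pages are what that routine later hands back to the free
 * list.  The function name and the use of plain C types are assumptions
 * made for the example.
 */
static int
example_extent_order(unsigned long minctg, unsigned long *extrap)
{
	int order = 0;

	/* smallest order with (1 << order) >= minctg, i.e. ceil(log2) */
	while ((1UL << order) < minctg)
		order++;

	/* pages allocated beyond the request; returned to the free list */
	*extrap = (1UL << order) - minctg;
	return (order);
}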