/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>
#include <util/qsort.h>
#include <sys/taskq.h>

#ifdef __xpv

#include <sys/hypervisor.h>
#include <sys/xen_mmu.h>
#include <sys/balloon_impl.h>

/*
 * domain 0 pages usable for DMA are kept pre-allocated and kept in
 * distinct lists, ordered by increasing mfn.
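 *
 * The pool is sized as io_pool_physmem_pct percent of physical memory,
 * bounded by io_pool_cnt_min and io_pool_cnt_max below, and is grown or
 * shrunk on demand (the io_pool_grows/io_pool_shrinks counters below track
 * that activity).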
 */
static kmutex_t io_pool_lock;
static kmutex_t contig_list_lock;
static page_t *io_pool_4g;	/* pool for 32 bit dma limited devices */
static page_t *io_pool_16m;	/* pool for 24 bit dma limited legacy devices */
static long io_pool_cnt;
static long io_pool_cnt_max = 0;
#define	DEFAULT_IO_POOL_MIN	128
static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
static long io_pool_cnt_lowater = 0;
static long io_pool_shrink_attempts; /* how many times did we try to shrink */
static long io_pool_shrinks;	/* how many times did we really shrink */
static long io_pool_grows;	/* how many times did we grow */
static mfn_t start_mfn = 1;
static caddr_t io_pool_kva;	/* use to alloc pages when needed */

static int create_contig_pfnlist(uint_t);

/*
 * percentage of phys mem to hold in the i/o pool
 */
#define	DEFAULT_IO_POOL_PCT	2
static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
static void page_io_pool_sub(page_t **, page_t *, page_t *);
int ioalloc_dbg = 0;

#endif /* __xpv */

uint_t vac_colors = 1;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern uint_t page_create_putbacks;
/*
 * Allow users to disable the kernel's use of SSE.
 */
extern int use_sse_pagecopy, use_sse_pagezero;

/*
 * combined memory ranges from mnode and memranges[] to manage single
 * mnode/mtype dimension in the page lists.
 */
typedef struct {
	pfn_t	mnr_pfnlo;
	pfn_t	mnr_pfnhi;
	int	mnr_mnode;
	int	mnr_memrange;		/* index into memranges[] */
	/* maintain page list stats */
	pgcnt_t	mnr_mt_clpgcnt;		/* cache list cnt */
	pgcnt_t	mnr_mt_flpgcnt[MMU_PAGE_SIZES];	/* free list cnt per szc */
	pgcnt_t	mnr_mt_totcnt;		/* sum of cache and free lists */
#ifdef DEBUG
	struct mnr_mts {		/* mnode/mtype szc stats */
		pgcnt_t	mnr_mts_pgcnt;
		int	mnr_mts_colors;
		pgcnt_t	*mnr_mtsc_pgcnt;
	} *mnr_mts;
#endif
} mnoderange_t;

#define	MEMRANGEHI(mtype)						\
	((mtype > 0) ? memranges[mtype - 1] - 1 : physmax)
#define	MEMRANGELO(mtype)	(memranges[mtype])

#define	MTYPE_FREEMEM(mt)	(mnoderanges[mt].mnr_mt_totcnt)

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 *
 * These are listed in reverse order, so that we can skip over unused
 * ranges on machines with small memories.
 *
 * For now under the Hypervisor, we'll only ever have one memrange.
 */
#define	PFN_4GIG	0x100000
#define	PFN_16MEG	0x1000
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	PFN_4GIG,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	PFN_16MEG,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * This combines mem_node_config and memranges into one data
 * structure to be used for page list management.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

/*
 * 4g memory management variables for systems with more than 4g of memory:
 *
 * physical memory below 4g is required for 32bit dma devices and, currently,
 * for kmem memory. On systems with more than 4g of memory, the pool of memory
 * below 4g can be depleted without any paging activity given that there is
 * likely to be sufficient memory above 4g.
 *
 * physmax4g is set true if the largest pfn is over 4g. The rest of the
 * 4g memory management code is enabled only when physmax4g is true.
 *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 * agp aperture etc.
 *
 * freemem4g maintains the count of the number of available pages on the
 * page lists with physical addresses below 4g.
 *
 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 * 6% (desfree4gshift = 4) of maxmem4g.
 *
 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 * and the amount of physical memory above 4g is greater than freemem4g.
 * In this case, page_get_* routines will restrict below 4g allocations
 * for requests that don't specifically require it.
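 *
 * As a rough illustration (example numbers, not values taken from this
 * code): with 1G of usable memory below 4g, maxmem4g is 0x40000 pages, so
 * the default shifts below give DESFREE4G = 0x4000 pages (64MB) and
 * LOTSFREE4G = 0x8000 pages (128MB).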
 */

#define	LOTSFREE4G	(maxmem4g >> lotsfree4gshift)
#define	DESFREE4G	(maxmem4g >> desfree4gshift)

#define	RESTRICT4G_ALLOC					\
	(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))

static pgcnt_t	maxmem4g;
static pgcnt_t	freemem4g;
static int	physmax4g;
static int	desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
static int	lotsfree4gshift = 3;

/*
 * 16m memory management:
 *
 * reserve some amount of physical memory below 16m for legacy devices.
 *
 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 * 16m or if the 16m pool drops below DESFREE16M.
 *
 * In this case, general page allocations via page_get_{free,cache}list
 * routines will be restricted from allocating from the 16m pool. Allocations
 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 * are not restricted.
 */

#define	FREEMEM16M	MTYPE_FREEMEM(0)
#define	DESFREE16M	desfree16m
#define	RESTRICT16M_ALLOC(freemem, pgcnt, flags)		\
	((freemem != 0) && ((flags & PG_PANIC) == 0) &&		\
	    ((freemem >= (FREEMEM16M)) ||			\
	    (FREEMEM16M < (DESFREE16M + pgcnt))))

static pgcnt_t	desfree16m = 0x380;

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
int restricted_kmemalloc = 0;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

/* page sizes that legacy applications can see */
uint_t mmu_legacy_page_sizes;

/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Maximum and default segment size tunables for user private
 * and shared anon memory, and user text and initialized data.
 * These can be patched via /etc/system to allow large pages
 * to be used for mapping application private and shared anon memory.
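 *
 * For example, a hypothetical /etc/system entry such as
 *	set max_uheap_lpsize = 0x200000
 * would allow 2M pages to be considered for user heap mappings (the value
 * 0x200000 is used here purely as an illustration, not a default).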
 */
size_t mcntl0_lpsize = MMU_PAGESIZE;
size_t max_uheap_lpsize = MMU_PAGESIZE;
size_t default_uheap_lpsize = MMU_PAGESIZE;
size_t max_ustack_lpsize = MMU_PAGESIZE;
size_t default_ustack_lpsize = MMU_PAGESIZE;
size_t max_privmap_lpsize = MMU_PAGESIZE;
size_t max_uidata_lpsize = MMU_PAGESIZE;
size_t max_utext_lpsize = MMU_PAGESIZE;
size_t max_shm_lpsize = MMU_PAGESIZE;


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;


/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];

/*
 * Only let one thread at a time try to coalesce large pages, to
 * prevent them from working against each other.
 */
static kmutex_t	contig_lock;
#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
	level_t l = 0;
	size_t pgsz = MMU_PAGESIZE;
	size_t max_lpsize;
	uint_t mszc;

	ASSERT(maptype != MAPPGSZ_VA);

	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
		return (MMU_PAGESIZE);
	}

	switch (maptype) {
	case MAPPGSZ_HEAP:
	case MAPPGSZ_STK:
		max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
		    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
		if (max_lpsize == MMU_PAGESIZE) {
			return (MMU_PAGESIZE);
		}
		if (len == 0) {
			len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
			    p->p_brksize - p->p_bssbase : p->p_stksize;
		}
		len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
		    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);

		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.umax_page_level; l > 0; --l) {
			if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
				continue;
			} else {
				pgsz = LEVEL_SIZE(l);
			}
			break;
		}

		mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
		    p->p_stkpageszc);
		if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
			pgsz = hw_page_array[mszc].hp_size;
		}
		return (pgsz);

	case MAPPGSZ_ISM:
		for (l = mmu.umax_page_level; l > 0; --l) {
			if (len >= LEVEL_SIZE(l))
				return (LEVEL_SIZE(l));
		}
		return (LEVEL_SIZE(0));
	}
	return (pgsz);
}

static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
    size_t min_physmem)
{
	caddr_t eaddr = addr + size;
	uint_t szcvec = 0;
	caddr_t raddr;
	caddr_t readdr;
	size_t pgsz;
	int i;

	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
		return (0);
	}

	for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
		pgsz = page_get_pagesize(i);
		if (pgsz > max_lpsize) {
			continue;
		}
		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		if (raddr < addr || raddr >= readdr) {
			continue;
		}
		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
			continue;
		}
		/*
		 * Set szcvec to the remaining page sizes.
		 */
		szcvec = ((1 << (i + 1)) - 1) & ~1;
		break;
	}
	return (szcvec);
}

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
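 *
 * For example (following the encoding in map_szcvec() above): if the
 * largest page size code that fits the request is 2, the returned vector
 * is ((1 << 3) - 1) & ~1 == 0x6, i.e. size codes 1 and 2 but never the
 * base page size code 0.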
 */
/*ARGSUSED*/
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
	size_t max_lpsize = mcntl0_lpsize;

	if (mmu.max_page_level == 0)
		return (0);

	if (flags & MAP_TEXT) {
		if (!memcntl)
			max_lpsize = max_utext_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (flags & MAP_INITDATA) {
		if (!memcntl)
			max_lpsize = max_uidata_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_SHM) {
		if (!memcntl)
			max_lpsize = max_shm_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (type == MAPPGSZC_HEAP) {
		if (!memcntl)
			max_lpsize = max_uheap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_STACK) {
		if (!memcntl)
			max_lpsize = max_ustack_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else {
		if (!memcntl)
			max_lpsize = max_privmap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));
	}
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {		/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * the rest of this function implements 3.X 4.X 5.X compatibility
	 * This code is probably not needed anymore
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below userlimit.
 *
 * Every mapping will have a redzone of a single page on either side of
 * the request. This is done to leave one page unmapped between segments.
 * This is not required, but it's useful for the user because if their
 * program strays across a segment boundary, it will catch a fault
 * immediately making debugging a little easier.  Currently the redzone
 * is mandatory.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *	If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 *	must be some "power of two" multiple of pagesize.
 *
 *	On output it is NULL if no address can be found in the current
 *	processes address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * vacalign is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	/* Make len be a multiple of PAGESIZE */
	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.umax_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	ASSERT(ISP2(align_amount));
	ASSERT(align_amount == 0 || align_amount >= PAGESIZE);

	off = off & (align_amount - 1);
	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.
	 */
	if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
	    PAGESIZE, off) == 0) {
		caddr_t as_addr;

		/*
		 * addr is the highest possible address to use since we have
		 * a PAGESIZE redzone at the beginning and end.
		 */
		addr = base + slen - (PAGESIZE + len);
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount and
		 * add the offset in.
		 * If addr is greater than as_addr, len would not be large
		 * enough to include the redzone, so we must adjust down
		 * by the alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)off;
		if (addr > as_addr) {
			addr -= align_amount;
		}

		ASSERT(addr > base);
		ASSERT(addr + len < base + slen);
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off)));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}

int valid_va_range_aligned_wraparound;

/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long, where the base of the range is at "off"
 * phase from an "align" boundary and there is space for a "redzone"-sized
 * redzone on either side of the range.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range (including
 * the redzone).  On failure, 0 is returned.
 */
/*ARGSUSED3*/
int
valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
    size_t align, size_t redzone, size_t off)
{
	uintptr_t hi, lo;
	size_t tot_len;

	ASSERT(align == 0 ? off == 0 : off < align);
	ASSERT(ISP2(align));
	ASSERT(align == 0 || align >= PAGESIZE);

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;
	tot_len = minlen + 2 * redzone; /* need at least this much space */

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		*lenp = 0UL - lo - 1UL;
		/* See if this really happens. If so, then we figure out why */
		valid_va_range_aligned_wraparound++;
		hi = lo + *lenp;
	}
	if (*lenp < tot_len) {
		return (0);
	}

#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
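	 * When the candidate range straddles the hole entirely, the side
	 * kept is the one indicated by "dir" (AH_LO prefers the low side),
	 * provided that side still has room for tot_len bytes.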
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= tot_len)
						hi = hole_start;
					else if (hi - hole_end >= tot_len)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= tot_len)
						lo = hole_end;
					else if (hole_start - lo >= tot_len)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}
#endif

	if (hi - lo < tot_len)
		return (0);

	if (align > 1) {
		uintptr_t tlo = lo + redzone;
		uintptr_t thi = hi - redzone;
		tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
		if (tlo < lo + redzone) {
			return (0);
		}
		if (thi < tlo || thi - tlo < minlen) {
			return (0);
		}
	}

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
	return (1);
}

/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 * is returned.
 */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
}

/*
 * Determine whether [addr, addr+len] are valid user addresses.
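 * Returns RANGE_BADADDR when the range wraps, extends beyond userlimit or
 * (on amd64) overlaps the VA hole; otherwise RANGE_OKAY is returned.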
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	if (pfn_is_foreign(pf))
		return (0);
	return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
}

/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}

/*
 * return the mnoderange containing pfn
 */
/*ARGSUSED*/
int
pfn_2_mtype(pfn_t pfn)
{
#if defined(__xpv)
	return (0);
#else
	int	n;

	for (n = mnoderangecnt - 1; n >= 0; n--) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
#endif
}

#if !defined(__xpv)
/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
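	 *
	 * For example (an assumed attribute value, not one taken from this
	 * file): a dma_attr_seg of 0xffffffff yields pfnseg == 0xfffff, so
	 * the masked comparison below rejects any run of pages that would
	 * cross a 4G segment boundary.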
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}
#endif	/* !__xpv */

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
		pp = pp->p_next;
	}
}
#endif

#if !defined(__xpv)
static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

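		/*
		 * Note: minctg is recomputed after each successful run found
		 * below, spreading the remaining pages over the remaining
		 * scatter-gather entries.
		 */
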
		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}
#endif	/* !__xpv */

/*
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 */
int
mnode_range_cnt(int mnode)
{
#if defined(__xpv)
	ASSERT(mnode == 0);
	return (1);
#else	/* __xpv */
	int	mri;
	int	mnrcnt = 0;

	if (mem_node_config[mnode].exists != 0) {
		mri = nranges - 1;

		/* find the memranges index below contained in mnode range */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
	return (mnrcnt);
#endif	/* __xpv */
}

/*
 * mnode_range_setup() initializes mnoderanges.
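 * Each entry describes the intersection of one memory node with one of the
 * memranges[] DMA ranges; for a given mnode the entries are generated in
 * order of increasing physical address, mirroring the walk performed by
 * mnode_range_cnt() above.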
 */
void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}

/*ARGSUSED*/
int
mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
{
	int mtype = mnoderangecnt - 1;

#if !defined(__xpv)
#if defined(__i386)
	/*
	 * set the mtype range
	 * - kmem requests need to be below 4g if restricted_kmemalloc is set.
	 * - for non kmem requests, set range to above 4g if memory below 4g
	 *   runs low.
	 */
	if (restricted_kmemalloc && VN_ISKAS(vp) &&
	    (caddr_t)(vaddr) >= kernelheap &&
	    (caddr_t)(vaddr) < ekernelheap) {
		ASSERT(physmax4g);
		mtype = mtype4g;
		if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
		    btop(pgsz), *flags)) {
			*flags |= PGI_MT_RANGE16M;
		} else {
			VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
			VM_STAT_COND_ADD((*flags & PG_PANIC),
			    vmm_vmstats.pgpanicalloc);
			*flags |= PGI_MT_RANGE0;
		}
		return (mtype);
	}
#endif	/* __i386 */

	if (RESTRICT4G_ALLOC) {
		VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
		/* here only for > 4g systems */
		*flags |= PGI_MT_RANGE4G;
	} else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
		*flags |= PGI_MT_RANGE16M;
	} else {
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
		VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
		*flags |= PGI_MT_RANGE0;
	}
#endif /* !__xpv */
	return (mtype);
}


/* mtype init for page_get_replacement_page */
/*ARGSUSED*/
int
mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
{
	int mtype = mnoderangecnt - 1;
#if !defined(__xpv)
	if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
		*flags |= PGI_MT_RANGE16M;
	} else {
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
		*flags |= PGI_MT_RANGE0;
	}
#endif
	return (mtype);
}

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains
 * the range of indices from high pfn to 0, 16m or 4g.
 *
 * Return the first mnode range type index found; otherwise return -1 if
 * none is found.
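 *
 * Callers that need to visit every range belonging to an mnode typically
 * pass PGI_MT_NEXT on subsequent calls, as mnode_pgcnt() below does.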
13490Sstevel@tonic-gate */ 13500Sstevel@tonic-gate int 13510Sstevel@tonic-gate mtype_func(int mnode, int mtype, uint_t flags) 13520Sstevel@tonic-gate { 13530Sstevel@tonic-gate if (flags & PGI_MT_RANGE) { 13545084Sjohnlev int mtlim = 0; 13550Sstevel@tonic-gate 13560Sstevel@tonic-gate if (flags & PGI_MT_NEXT) 13570Sstevel@tonic-gate mtype--; 13585084Sjohnlev if (flags & PGI_MT_RANGE4G) 13591385Skchow mtlim = mtype4g + 1; /* exclude 0-4g range */ 13601385Skchow else if (flags & PGI_MT_RANGE16M) 13611385Skchow mtlim = 1; /* exclude 0-16m range */ 13620Sstevel@tonic-gate while (mtype >= mtlim) { 13630Sstevel@tonic-gate if (mnoderanges[mtype].mnr_mnode == mnode) 13640Sstevel@tonic-gate return (mtype); 13650Sstevel@tonic-gate mtype--; 13660Sstevel@tonic-gate } 13675084Sjohnlev } else if (mnoderanges[mtype].mnr_mnode == mnode) { 13685084Sjohnlev return (mtype); 13690Sstevel@tonic-gate } 13700Sstevel@tonic-gate return (-1); 13710Sstevel@tonic-gate } 13720Sstevel@tonic-gate 13730Sstevel@tonic-gate /* 13741373Skchow * Update the page list max counts with the pfn range specified by the 13751373Skchow * input parameters. Called from add_physmem() when physical memory with 13761373Skchow * page_t's are initially added to the page lists. 13771373Skchow */ 13781373Skchow void 13791373Skchow mtype_modify_max(pfn_t startpfn, long cnt) 13801373Skchow { 13811373Skchow int mtype = 0; 13821373Skchow pfn_t endpfn = startpfn + cnt, pfn; 13831373Skchow pgcnt_t inc; 13841373Skchow 13851373Skchow ASSERT(cnt > 0); 13861373Skchow 13875084Sjohnlev if (!physmax4g) 13885084Sjohnlev return; 13895084Sjohnlev 13901373Skchow for (pfn = startpfn; pfn < endpfn; ) { 13911373Skchow if (pfn <= mnoderanges[mtype].mnr_pfnhi) { 13921373Skchow if (endpfn < mnoderanges[mtype].mnr_pfnhi) { 13931373Skchow inc = endpfn - pfn; 13941373Skchow } else { 13951373Skchow inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1; 13961373Skchow } 13975084Sjohnlev if (mtype <= mtype4g) 13981373Skchow maxmem4g += inc; 13991373Skchow pfn += inc; 14001373Skchow } 14011373Skchow mtype++; 14021373Skchow ASSERT(mtype < mnoderangecnt || pfn >= endpfn); 14031373Skchow } 14041373Skchow } 14051373Skchow 14065084Sjohnlev int 14075084Sjohnlev mtype_2_mrange(int mtype) 14085084Sjohnlev { 14095084Sjohnlev return (mnoderanges[mtype].mnr_memrange); 14105084Sjohnlev } 14115084Sjohnlev 14125084Sjohnlev void 14135084Sjohnlev mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi) 14145084Sjohnlev { 14155084Sjohnlev ASSERT(mnoderanges[mtype].mnr_mnode == mnode); 14165084Sjohnlev *pfnlo = mnoderanges[mtype].mnr_pfnlo; 14175084Sjohnlev *pfnhi = mnoderanges[mtype].mnr_pfnhi; 14185084Sjohnlev } 14195084Sjohnlev 14205084Sjohnlev size_t 14215084Sjohnlev plcnt_sz(size_t ctrs_sz) 14225084Sjohnlev { 14235084Sjohnlev #ifdef DEBUG 14245084Sjohnlev int szc, colors; 14255084Sjohnlev 14265084Sjohnlev ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes; 14275084Sjohnlev for (szc = 0; szc < mmu_page_sizes; szc++) { 14285084Sjohnlev colors = page_get_pagecolors(szc); 14295084Sjohnlev ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors; 14305084Sjohnlev } 14315084Sjohnlev #endif 14325084Sjohnlev return (ctrs_sz); 14335084Sjohnlev } 14345084Sjohnlev 14355084Sjohnlev caddr_t 14365084Sjohnlev plcnt_init(caddr_t addr) 14375084Sjohnlev { 14385084Sjohnlev #ifdef DEBUG 14395084Sjohnlev int mt, szc, colors; 14405084Sjohnlev 14415084Sjohnlev for (mt = 0; mt < mnoderangecnt; mt++) { 14425084Sjohnlev mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr; 14435084Sjohnlev addr += 
(sizeof (struct mnr_mts) * mmu_page_sizes); 14445084Sjohnlev for (szc = 0; szc < mmu_page_sizes; szc++) { 14455084Sjohnlev colors = page_get_pagecolors(szc); 14465084Sjohnlev mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors; 14475084Sjohnlev mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt = 14485084Sjohnlev (pgcnt_t *)addr; 14495084Sjohnlev addr += (sizeof (pgcnt_t) * colors); 14505084Sjohnlev } 14515084Sjohnlev } 14525084Sjohnlev #endif 14535084Sjohnlev return (addr); 14545084Sjohnlev } 14555084Sjohnlev 14565084Sjohnlev void 14575084Sjohnlev plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) 14585084Sjohnlev { 14595084Sjohnlev #ifdef DEBUG 14605084Sjohnlev int bin = PP_2_BIN(pp); 14615084Sjohnlev 14625084Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt); 14635084Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin], 14645084Sjohnlev cnt); 14655084Sjohnlev #endif 14665084Sjohnlev ASSERT(mtype == PP_2_MTYPE(pp)); 14675084Sjohnlev if (physmax4g && mtype <= mtype4g) 14685084Sjohnlev atomic_add_long(&freemem4g, cnt); 14695084Sjohnlev if (flags & PG_CACHE_LIST) 14705084Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt); 14715084Sjohnlev else 14725466Skchow atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt); 14735466Skchow atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt); 14745084Sjohnlev } 14755084Sjohnlev 14761373Skchow /* 1477414Skchow * Returns the free page count for mnode 1478414Skchow */ 1479414Skchow int 1480414Skchow mnode_pgcnt(int mnode) 1481414Skchow { 1482414Skchow int mtype = mnoderangecnt - 1; 1483414Skchow int flags = PGI_MT_RANGE0; 1484414Skchow pgcnt_t pgcnt = 0; 1485414Skchow 1486414Skchow mtype = mtype_func(mnode, mtype, flags); 1487414Skchow 1488414Skchow while (mtype != -1) { 14891385Skchow pgcnt += MTYPE_FREEMEM(mtype); 1490414Skchow mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1491414Skchow } 1492414Skchow return (pgcnt); 1493414Skchow } 1494414Skchow 1495414Skchow /* 14960Sstevel@tonic-gate * Initialize page coloring variables based on the l2 cache parameters. 14970Sstevel@tonic-gate * Calculate and return memory needed for page coloring data structures. 14980Sstevel@tonic-gate */ 14990Sstevel@tonic-gate size_t 15000Sstevel@tonic-gate page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) 15010Sstevel@tonic-gate { 15020Sstevel@tonic-gate size_t colorsz = 0; 15030Sstevel@tonic-gate int i; 15040Sstevel@tonic-gate int colors; 15050Sstevel@tonic-gate 15065084Sjohnlev #if defined(__xpv) 15075084Sjohnlev /* 15085084Sjohnlev * Hypervisor domains currently don't have any concept of NUMA. 15095084Sjohnlev * Hence we'll act like there is only 1 memrange. 15105084Sjohnlev */ 15115084Sjohnlev i = memrange_num(1); 15125084Sjohnlev #else /* !__xpv */ 15130Sstevel@tonic-gate /* 15140Sstevel@tonic-gate * Reduce the memory ranges lists if we don't have large amounts 15150Sstevel@tonic-gate * of memory. This avoids searching known empty free lists. 
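 * For example, if physmax falls below the 4g pfn boundary, memrange_num()
 * returns a nonzero index and the memranges/nranges adjustment below drops
 * the 4g-and-above range, so the page_get code never walks freelists that
 * cannot contain any pages.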
15160Sstevel@tonic-gate */ 15170Sstevel@tonic-gate i = memrange_num(physmax); 15180Sstevel@tonic-gate #if defined(__i386) 15190Sstevel@tonic-gate if (i > 0) 15200Sstevel@tonic-gate restricted_kmemalloc = 0; 15210Sstevel@tonic-gate #endif 15220Sstevel@tonic-gate /* physmax greater than 4g */ 15230Sstevel@tonic-gate if (i == 0) 15240Sstevel@tonic-gate physmax4g = 1; 15255084Sjohnlev #endif /* !__xpv */ 15265084Sjohnlev memranges += i; 15275084Sjohnlev nranges -= i; 15280Sstevel@tonic-gate 15295349Skchow ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES); 15305349Skchow 15310Sstevel@tonic-gate ASSERT(ISP2(l2_linesz)); 15320Sstevel@tonic-gate ASSERT(l2_sz > MMU_PAGESIZE); 15330Sstevel@tonic-gate 15340Sstevel@tonic-gate /* l2_assoc is 0 for fully associative l2 cache */ 15350Sstevel@tonic-gate if (l2_assoc) 15360Sstevel@tonic-gate l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE)); 15370Sstevel@tonic-gate else 15380Sstevel@tonic-gate l2_colors = 1; 15390Sstevel@tonic-gate 15407069Svd224797 ASSERT(ISP2(l2_colors)); 15417069Svd224797 15420Sstevel@tonic-gate /* for scalability, configure at least PAGE_COLORS_MIN color bins */ 15430Sstevel@tonic-gate page_colors = MAX(l2_colors, PAGE_COLORS_MIN); 15440Sstevel@tonic-gate 15450Sstevel@tonic-gate /* 15460Sstevel@tonic-gate * cpu_page_colors is non-zero when a page color may be spread across 15470Sstevel@tonic-gate * multiple bins. 15480Sstevel@tonic-gate */ 15490Sstevel@tonic-gate if (l2_colors < page_colors) 15500Sstevel@tonic-gate cpu_page_colors = l2_colors; 15510Sstevel@tonic-gate 15520Sstevel@tonic-gate ASSERT(ISP2(page_colors)); 15530Sstevel@tonic-gate 15540Sstevel@tonic-gate page_colors_mask = page_colors - 1; 15550Sstevel@tonic-gate 15560Sstevel@tonic-gate ASSERT(ISP2(CPUSETSIZE())); 15570Sstevel@tonic-gate page_coloring_shift = lowbit(CPUSETSIZE()); 15580Sstevel@tonic-gate 15592961Sdp78419 /* initialize number of colors per page size */ 15602961Sdp78419 for (i = 0; i <= mmu.max_page_level; i++) { 15612961Sdp78419 hw_page_array[i].hp_size = LEVEL_SIZE(i); 15622961Sdp78419 hw_page_array[i].hp_shift = LEVEL_SHIFT(i); 15632961Sdp78419 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0); 15642961Sdp78419 hw_page_array[i].hp_colors = (page_colors_mask >> 15652961Sdp78419 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift)) 15662961Sdp78419 + 1; 15673717Sdp78419 colorequivszc[i] = 0; 15682961Sdp78419 } 15692961Sdp78419 15702961Sdp78419 /* 15712961Sdp78419 * The value of cpu_page_colors determines if additional color bins 15722961Sdp78419 * need to be checked for a particular color in the page_get routines. 15732961Sdp78419 */ 15742961Sdp78419 if (cpu_page_colors != 0) { 15752961Sdp78419 15762961Sdp78419 int a = lowbit(page_colors) - lowbit(cpu_page_colors); 15772961Sdp78419 ASSERT(a > 0); 15782961Sdp78419 ASSERT(a < 16); 15792961Sdp78419 15802961Sdp78419 for (i = 0; i <= mmu.max_page_level; i++) { 15812961Sdp78419 if ((colors = hw_page_array[i].hp_colors) <= 1) { 15822961Sdp78419 colorequivszc[i] = 0; 15832961Sdp78419 continue; 15842961Sdp78419 } 15852961Sdp78419 while ((colors >> a) == 0) 15862961Sdp78419 a--; 15872961Sdp78419 ASSERT(a >= 0); 15882961Sdp78419 15892961Sdp78419 /* higher 4 bits encodes color equiv mask */ 15902961Sdp78419 colorequivszc[i] = (a << 4); 15912961Sdp78419 } 15922961Sdp78419 } 15932961Sdp78419 15945084Sjohnlev /* factor in colorequiv to check additional 'equivalent' bins. 
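 * A small worked example of the encoding above: with page_colors = 64 and
 * cpu_page_colors = 16, a = lowbit(64) - lowbit(16) = 2, so the szc 0 entry
 * becomes colorequivszc[0] = (2 << 4) = 0x20, recording a color-equivalence
 * shift of 2 in the upper four bits for use by the page_get code.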
*/ 15955084Sjohnlev if (colorequiv > 1) { 15965084Sjohnlev 15975084Sjohnlev int a = lowbit(colorequiv) - 1; 15985084Sjohnlev if (a > 15) 15995084Sjohnlev a = 15; 16005084Sjohnlev 16015084Sjohnlev for (i = 0; i <= mmu.max_page_level; i++) { 16025084Sjohnlev if ((colors = hw_page_array[i].hp_colors) <= 1) { 16035084Sjohnlev continue; 16045084Sjohnlev } 16055084Sjohnlev while ((colors >> a) == 0) 16065084Sjohnlev a--; 16075084Sjohnlev if ((a << 4) > colorequivszc[i]) { 16085084Sjohnlev colorequivszc[i] = (a << 4); 16095084Sjohnlev } 16105084Sjohnlev } 16115084Sjohnlev } 16125084Sjohnlev 16130Sstevel@tonic-gate /* size for mnoderanges */ 16142961Sdp78419 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++) 16152961Sdp78419 mnoderangecnt += mnode_range_cnt(i); 16160Sstevel@tonic-gate colorsz = mnoderangecnt * sizeof (mnoderange_t); 16170Sstevel@tonic-gate 16180Sstevel@tonic-gate /* size for fpc_mutex and cpc_mutex */ 16190Sstevel@tonic-gate colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX); 16200Sstevel@tonic-gate 16210Sstevel@tonic-gate /* size of page_freelists */ 16220Sstevel@tonic-gate colorsz += mnoderangecnt * sizeof (page_t ***); 16230Sstevel@tonic-gate colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **); 16240Sstevel@tonic-gate 16250Sstevel@tonic-gate for (i = 0; i < mmu_page_sizes; i++) { 16260Sstevel@tonic-gate colors = page_get_pagecolors(i); 16270Sstevel@tonic-gate colorsz += mnoderangecnt * colors * sizeof (page_t *); 16280Sstevel@tonic-gate } 16290Sstevel@tonic-gate 16300Sstevel@tonic-gate /* size of page_cachelists */ 16310Sstevel@tonic-gate colorsz += mnoderangecnt * sizeof (page_t **); 16320Sstevel@tonic-gate colorsz += mnoderangecnt * page_colors * sizeof (page_t *); 16330Sstevel@tonic-gate 16340Sstevel@tonic-gate return (colorsz); 16350Sstevel@tonic-gate } 16360Sstevel@tonic-gate 16370Sstevel@tonic-gate /* 16380Sstevel@tonic-gate * Called once at startup to configure page_coloring data structures and 16390Sstevel@tonic-gate * does the 1st page_free()/page_freelist_add(). 
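 *
 * The pcmemaddr buffer was sized by page_coloring_init() and is carved up
 * below into:
 *	mnoderanges[mnoderangecnt]
 *	fpc_mutex[NPC_MUTEX] and cpc_mutex[NPC_MUTEX] (max_mem_nodes each)
 *	page_freelists pointer arrays (per mnoderange, page size and color)
 *	page_cachelists pointer arrays (per mnoderange and color)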
16400Sstevel@tonic-gate */ 16410Sstevel@tonic-gate void 16420Sstevel@tonic-gate page_coloring_setup(caddr_t pcmemaddr) 16430Sstevel@tonic-gate { 16440Sstevel@tonic-gate int i; 16450Sstevel@tonic-gate int j; 16460Sstevel@tonic-gate int k; 16470Sstevel@tonic-gate caddr_t addr; 16480Sstevel@tonic-gate int colors; 16490Sstevel@tonic-gate 16500Sstevel@tonic-gate /* 16510Sstevel@tonic-gate * do page coloring setup 16520Sstevel@tonic-gate */ 16530Sstevel@tonic-gate addr = pcmemaddr; 16540Sstevel@tonic-gate 16550Sstevel@tonic-gate mnoderanges = (mnoderange_t *)addr; 16560Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (mnoderange_t)); 16570Sstevel@tonic-gate 16580Sstevel@tonic-gate mnode_range_setup(mnoderanges); 16590Sstevel@tonic-gate 16600Sstevel@tonic-gate if (physmax4g) 16610Sstevel@tonic-gate mtype4g = pfn_2_mtype(0xfffff); 16620Sstevel@tonic-gate 16630Sstevel@tonic-gate for (k = 0; k < NPC_MUTEX; k++) { 16640Sstevel@tonic-gate fpc_mutex[k] = (kmutex_t *)addr; 16650Sstevel@tonic-gate addr += (max_mem_nodes * sizeof (kmutex_t)); 16660Sstevel@tonic-gate } 16670Sstevel@tonic-gate for (k = 0; k < NPC_MUTEX; k++) { 16680Sstevel@tonic-gate cpc_mutex[k] = (kmutex_t *)addr; 16690Sstevel@tonic-gate addr += (max_mem_nodes * sizeof (kmutex_t)); 16700Sstevel@tonic-gate } 16710Sstevel@tonic-gate page_freelists = (page_t ****)addr; 16720Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (page_t ***)); 16730Sstevel@tonic-gate 16740Sstevel@tonic-gate page_cachelists = (page_t ***)addr; 16750Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (page_t **)); 16760Sstevel@tonic-gate 16770Sstevel@tonic-gate for (i = 0; i < mnoderangecnt; i++) { 16780Sstevel@tonic-gate page_freelists[i] = (page_t ***)addr; 16790Sstevel@tonic-gate addr += (mmu_page_sizes * sizeof (page_t **)); 16800Sstevel@tonic-gate 16810Sstevel@tonic-gate for (j = 0; j < mmu_page_sizes; j++) { 16820Sstevel@tonic-gate colors = page_get_pagecolors(j); 16830Sstevel@tonic-gate page_freelists[i][j] = (page_t **)addr; 16840Sstevel@tonic-gate addr += (colors * sizeof (page_t *)); 16850Sstevel@tonic-gate } 16860Sstevel@tonic-gate page_cachelists[i] = (page_t **)addr; 16870Sstevel@tonic-gate addr += (page_colors * sizeof (page_t *)); 16880Sstevel@tonic-gate } 16890Sstevel@tonic-gate } 16900Sstevel@tonic-gate 16915084Sjohnlev #if defined(__xpv) 16925084Sjohnlev /* 16935084Sjohnlev * Give back 10% of the io_pool pages to the free list. 16945084Sjohnlev * Don't shrink the pool below some absolute minimum. 16955084Sjohnlev */ 16965084Sjohnlev static void 16975084Sjohnlev page_io_pool_shrink() 16985084Sjohnlev { 16995084Sjohnlev int retcnt; 17005084Sjohnlev page_t *pp, *pp_first, *pp_last, **curpool; 17015084Sjohnlev mfn_t mfn; 17025084Sjohnlev int bothpools = 0; 17035084Sjohnlev 17045084Sjohnlev mutex_enter(&io_pool_lock); 17055084Sjohnlev io_pool_shrink_attempts++; /* should be a kstat? */ 17065084Sjohnlev retcnt = io_pool_cnt / 10; 17075084Sjohnlev if (io_pool_cnt - retcnt < io_pool_cnt_min) 17085084Sjohnlev retcnt = io_pool_cnt - io_pool_cnt_min; 17095084Sjohnlev if (retcnt <= 0) 17105084Sjohnlev goto done; 17115084Sjohnlev io_pool_shrinks++; /* should be a kstat? */ 17125084Sjohnlev curpool = &io_pool_4g; 17135084Sjohnlev domore: 17145084Sjohnlev /* 17155084Sjohnlev * Loop through taking pages from the end of the list 17165084Sjohnlev * (highest mfns) till amount to return reached. 
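 * For example, with io_pool_cnt = 1000 and io_pool_cnt_min = 500, up to
 * 100 pages (10%) are freed, taken from the high-mfn tail of io_pool_4g
 * first and from io_pool_16m only if that is not enough.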
17175084Sjohnlev */ 17185084Sjohnlev for (pp = *curpool; pp && retcnt > 0; ) { 17195084Sjohnlev pp_first = pp_last = pp->p_prev; 17205084Sjohnlev if (pp_first == *curpool) 17215084Sjohnlev break; 17225084Sjohnlev retcnt--; 17235084Sjohnlev io_pool_cnt--; 17245084Sjohnlev page_io_pool_sub(curpool, pp_first, pp_last); 17255084Sjohnlev if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn) 17265084Sjohnlev start_mfn = mfn; 17275084Sjohnlev page_free(pp_first, 1); 17285084Sjohnlev pp = *curpool; 17295084Sjohnlev } 17305084Sjohnlev if (retcnt != 0 && !bothpools) { 17315084Sjohnlev /* 17325084Sjohnlev * If not enough found in less constrained pool try the 17335084Sjohnlev * more constrained one. 17345084Sjohnlev */ 17355084Sjohnlev curpool = &io_pool_16m; 17365084Sjohnlev bothpools = 1; 17375084Sjohnlev goto domore; 17385084Sjohnlev } 17395084Sjohnlev done: 17405084Sjohnlev mutex_exit(&io_pool_lock); 17415084Sjohnlev } 17425084Sjohnlev 17435084Sjohnlev #endif /* __xpv */ 17445084Sjohnlev 17455084Sjohnlev uint_t 17465084Sjohnlev page_create_update_flags_x86(uint_t flags) 17475084Sjohnlev { 17485084Sjohnlev #if defined(__xpv) 17495084Sjohnlev /* 17505084Sjohnlev * Check this is an urgent allocation and free pages are depleted. 17515084Sjohnlev */ 17525084Sjohnlev if (!(flags & PG_WAIT) && freemem < desfree) 17535084Sjohnlev page_io_pool_shrink(); 17545084Sjohnlev #else /* !__xpv */ 17555084Sjohnlev /* 17565084Sjohnlev * page_create_get_something may call this because 4g memory may be 17575084Sjohnlev * depleted. Set flags to allow for relocation of base page below 17585084Sjohnlev * 4g if necessary. 17595084Sjohnlev */ 17605084Sjohnlev if (physmax4g) 17615084Sjohnlev flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); 17625084Sjohnlev #endif /* __xpv */ 17635084Sjohnlev return (flags); 17645084Sjohnlev } 17655084Sjohnlev 17660Sstevel@tonic-gate /*ARGSUSED*/ 17670Sstevel@tonic-gate int 17680Sstevel@tonic-gate bp_color(struct buf *bp) 17690Sstevel@tonic-gate { 17700Sstevel@tonic-gate return (0); 17710Sstevel@tonic-gate } 17720Sstevel@tonic-gate 17735084Sjohnlev #if defined(__xpv) 17745084Sjohnlev 17755084Sjohnlev /* 17765084Sjohnlev * Take pages out of an io_pool 17775084Sjohnlev */ 17785084Sjohnlev static void 17795084Sjohnlev page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last) 17805084Sjohnlev { 17815084Sjohnlev if (*poolp == pp_first) { 17825084Sjohnlev *poolp = pp_last->p_next; 17835084Sjohnlev if (*poolp == pp_first) 17845084Sjohnlev *poolp = NULL; 17855084Sjohnlev } 17865084Sjohnlev pp_first->p_prev->p_next = pp_last->p_next; 17875084Sjohnlev pp_last->p_next->p_prev = pp_first->p_prev; 17885084Sjohnlev pp_first->p_prev = pp_last; 17895084Sjohnlev pp_last->p_next = pp_first; 17905084Sjohnlev } 17915084Sjohnlev 17925084Sjohnlev /* 17935084Sjohnlev * Put a page on the io_pool list. The list is ordered by increasing MFN. 17945084Sjohnlev */ 17955084Sjohnlev static void 17965084Sjohnlev page_io_pool_add(page_t **poolp, page_t *pp) 17975084Sjohnlev { 17985084Sjohnlev page_t *look; 17995084Sjohnlev mfn_t mfn = mfn_list[pp->p_pagenum]; 18005084Sjohnlev 18015084Sjohnlev if (*poolp == NULL) { 18025084Sjohnlev *poolp = pp; 18035084Sjohnlev pp->p_next = pp; 18045084Sjohnlev pp->p_prev = pp; 18055084Sjohnlev return; 18065084Sjohnlev } 18075084Sjohnlev 18085084Sjohnlev /* 18095084Sjohnlev * Since we try to take pages from the high end of the pool 18105084Sjohnlev * chances are good that the pages to be put on the list will 18115084Sjohnlev * go at or near the end of the list. 
so start at the end and 18125084Sjohnlev * work backwards. 18135084Sjohnlev */ 18145084Sjohnlev look = (*poolp)->p_prev; 18155084Sjohnlev while (mfn < mfn_list[look->p_pagenum]) { 18165084Sjohnlev look = look->p_prev; 18175084Sjohnlev if (look == (*poolp)->p_prev) 18185084Sjohnlev break; /* backed all the way to front of list */ 18195084Sjohnlev } 18205084Sjohnlev 18215084Sjohnlev /* insert after look */ 18225084Sjohnlev pp->p_prev = look; 18235084Sjohnlev pp->p_next = look->p_next; 18245084Sjohnlev pp->p_next->p_prev = pp; 18255084Sjohnlev look->p_next = pp; 18265084Sjohnlev if (mfn < mfn_list[(*poolp)->p_pagenum]) { 18275084Sjohnlev /* 18285084Sjohnlev * we inserted a new first list element 18295084Sjohnlev * adjust pool pointer to newly inserted element 18305084Sjohnlev */ 18315084Sjohnlev *poolp = pp; 18325084Sjohnlev } 18335084Sjohnlev } 18345084Sjohnlev 18355084Sjohnlev /* 18365084Sjohnlev * Add a page to the io_pool. Setting the force flag will force the page 18375084Sjohnlev * into the io_pool no matter what. 18385084Sjohnlev */ 18395084Sjohnlev static void 18405084Sjohnlev add_page_to_pool(page_t *pp, int force) 18415084Sjohnlev { 18425084Sjohnlev page_t *highest; 18435084Sjohnlev page_t *freep = NULL; 18445084Sjohnlev 18455084Sjohnlev mutex_enter(&io_pool_lock); 18465084Sjohnlev /* 18475084Sjohnlev * Always keep the scarce low memory pages 18485084Sjohnlev */ 18495084Sjohnlev if (mfn_list[pp->p_pagenum] < PFN_16MEG) { 18505084Sjohnlev ++io_pool_cnt; 18515084Sjohnlev page_io_pool_add(&io_pool_16m, pp); 18525084Sjohnlev goto done; 18535084Sjohnlev } 18546159Ssmaybe if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) { 18555084Sjohnlev ++io_pool_cnt; 18565084Sjohnlev page_io_pool_add(&io_pool_4g, pp); 18575084Sjohnlev } else { 18585084Sjohnlev highest = io_pool_4g->p_prev; 18595084Sjohnlev if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) { 18605084Sjohnlev page_io_pool_sub(&io_pool_4g, highest, highest); 18615084Sjohnlev page_io_pool_add(&io_pool_4g, pp); 18625084Sjohnlev freep = highest; 18635084Sjohnlev } else { 18645084Sjohnlev freep = pp; 18655084Sjohnlev } 18665084Sjohnlev } 18675084Sjohnlev done: 18685084Sjohnlev mutex_exit(&io_pool_lock); 18695084Sjohnlev if (freep) 18705084Sjohnlev page_free(freep, 1); 18715084Sjohnlev } 18725084Sjohnlev 18735084Sjohnlev 18745084Sjohnlev int contig_pfn_cnt; /* no of pfns in the contig pfn list */ 18755084Sjohnlev int contig_pfn_max; /* capacity of the contig pfn list */ 18765084Sjohnlev int next_alloc_pfn; /* next position in list to start a contig search */ 18775084Sjohnlev int contig_pfnlist_updates; /* pfn list update count */ 18785084Sjohnlev int contig_pfnlist_builds; /* how many times have we (re)built list */ 18795084Sjohnlev int contig_pfnlist_buildfailed; /* how many times has list build failed */ 18805084Sjohnlev int create_contig_pending; /* nonzero means taskq creating contig list */ 18815084Sjohnlev pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */ 18825084Sjohnlev 18835084Sjohnlev /* 18845084Sjohnlev * Function to use in sorting a list of pfns by their underlying mfns. 
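 * Used with qsort(), e.g.
 *	qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
 * which leaves the pfns ordered by ascending underlying mfn.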
18855084Sjohnlev */ 18865084Sjohnlev static int 18875084Sjohnlev mfn_compare(const void *pfnp1, const void *pfnp2) 18885084Sjohnlev { 18895084Sjohnlev mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1]; 18905084Sjohnlev mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2]; 18915084Sjohnlev 18925084Sjohnlev if (mfn1 > mfn2) 18935084Sjohnlev return (1); 18945084Sjohnlev if (mfn1 < mfn2) 18955084Sjohnlev return (-1); 18965084Sjohnlev return (0); 18975084Sjohnlev } 18985084Sjohnlev 18995084Sjohnlev /* 19005084Sjohnlev * Compact the contig_pfn_list by tossing all the non-contiguous 19015084Sjohnlev * elements from the list. 19025084Sjohnlev */ 19035084Sjohnlev static void 19045084Sjohnlev compact_contig_pfn_list(void) 19055084Sjohnlev { 19065084Sjohnlev pfn_t pfn, lapfn, prev_lapfn; 19075084Sjohnlev mfn_t mfn; 19085084Sjohnlev int i, newcnt = 0; 19095084Sjohnlev 19105084Sjohnlev prev_lapfn = 0; 19115084Sjohnlev for (i = 0; i < contig_pfn_cnt - 1; i++) { 19125084Sjohnlev pfn = contig_pfn_list[i]; 19135084Sjohnlev lapfn = contig_pfn_list[i + 1]; 19145084Sjohnlev mfn = mfn_list[pfn]; 19155084Sjohnlev /* 19165084Sjohnlev * See if next pfn is for a contig mfn 19175084Sjohnlev */ 19185084Sjohnlev if (mfn_list[lapfn] != mfn + 1) 19195084Sjohnlev continue; 19205084Sjohnlev /* 19215084Sjohnlev * pfn and lookahead are both put in list 19225084Sjohnlev * unless pfn is the previous lookahead. 19235084Sjohnlev */ 19245084Sjohnlev if (pfn != prev_lapfn) 19255084Sjohnlev contig_pfn_list[newcnt++] = pfn; 19265084Sjohnlev contig_pfn_list[newcnt++] = lapfn; 19275084Sjohnlev prev_lapfn = lapfn; 19285084Sjohnlev } 19295084Sjohnlev for (i = newcnt; i < contig_pfn_cnt; i++) 19305084Sjohnlev contig_pfn_list[i] = 0; 19315084Sjohnlev contig_pfn_cnt = newcnt; 19325084Sjohnlev } 19335084Sjohnlev 19345084Sjohnlev /*ARGSUSED*/ 19355084Sjohnlev static void 19365084Sjohnlev call_create_contiglist(void *arg) 19375084Sjohnlev { 19385084Sjohnlev (void) create_contig_pfnlist(PG_WAIT); 19395084Sjohnlev } 19405084Sjohnlev 19415084Sjohnlev /* 19425084Sjohnlev * Create list of freelist pfns that have underlying 19435084Sjohnlev * contiguous mfns. The list is kept in ascending mfn order. 19445084Sjohnlev * returns 1 if list created else 0. 19455084Sjohnlev */ 19465084Sjohnlev static int 19475084Sjohnlev create_contig_pfnlist(uint_t flags) 19485084Sjohnlev { 19495084Sjohnlev pfn_t pfn; 19505084Sjohnlev page_t *pp; 19515529Ssmaybe int ret = 1; 19525529Ssmaybe 19535529Ssmaybe mutex_enter(&contig_list_lock); 19545084Sjohnlev if (contig_pfn_list != NULL) 19555529Ssmaybe goto out; 19565084Sjohnlev contig_pfn_max = freemem + (freemem / 10); 19575084Sjohnlev contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t), 19585084Sjohnlev (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP); 19595084Sjohnlev if (contig_pfn_list == NULL) { 19605084Sjohnlev /* 19615084Sjohnlev * If we could not create the contig list (because 19625084Sjohnlev * we could not sleep for memory). Dispatch a taskq that can 19635084Sjohnlev * sleep to get the memory. 
19645084Sjohnlev */ 19655084Sjohnlev if (!create_contig_pending) { 19665084Sjohnlev if (taskq_dispatch(system_taskq, call_create_contiglist, 19675084Sjohnlev NULL, TQ_NOSLEEP) != NULL) 19685084Sjohnlev create_contig_pending = 1; 19695084Sjohnlev } 19705084Sjohnlev contig_pfnlist_buildfailed++; /* count list build failures */ 19715529Ssmaybe ret = 0; 19725529Ssmaybe goto out; 19735084Sjohnlev } 19745529Ssmaybe create_contig_pending = 0; 19755084Sjohnlev ASSERT(contig_pfn_cnt == 0); 19765084Sjohnlev for (pfn = 0; pfn < mfn_count; pfn++) { 19775084Sjohnlev pp = page_numtopp_nolock(pfn); 19785084Sjohnlev if (pp == NULL || !PP_ISFREE(pp)) 19795084Sjohnlev continue; 19805084Sjohnlev contig_pfn_list[contig_pfn_cnt] = pfn; 19815084Sjohnlev if (++contig_pfn_cnt == contig_pfn_max) 19825084Sjohnlev break; 19835084Sjohnlev } 19849010SStuart.Maybee@Sun.COM /* 19859010SStuart.Maybee@Sun.COM * Sanity check the new list. 19869010SStuart.Maybee@Sun.COM */ 19879010SStuart.Maybee@Sun.COM if (contig_pfn_cnt < 2) { /* no contig pfns */ 19889010SStuart.Maybee@Sun.COM contig_pfn_cnt = 0; 19899010SStuart.Maybee@Sun.COM contig_pfnlist_buildfailed++; 19909010SStuart.Maybee@Sun.COM kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t)); 19919010SStuart.Maybee@Sun.COM contig_pfn_list = NULL; 19929010SStuart.Maybee@Sun.COM contig_pfn_max = 0; 19939010SStuart.Maybee@Sun.COM ret = 0; 19949010SStuart.Maybee@Sun.COM goto out; 19959010SStuart.Maybee@Sun.COM } 19965084Sjohnlev qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare); 19975084Sjohnlev compact_contig_pfn_list(); 19985084Sjohnlev /* 19995084Sjohnlev * Make sure next search of the newly created contiguous pfn 20005084Sjohnlev * list starts at the beginning of the list. 20015084Sjohnlev */ 20025084Sjohnlev next_alloc_pfn = 0; 20035084Sjohnlev contig_pfnlist_builds++; /* count list builds */ 20045529Ssmaybe out: 20055529Ssmaybe mutex_exit(&contig_list_lock); 20065529Ssmaybe return (ret); 20075084Sjohnlev } 20085084Sjohnlev 20095084Sjohnlev 20105084Sjohnlev /* 20115084Sjohnlev * Toss the current contig pfnlist. Someone is about to do a massive 20125084Sjohnlev * update to pfn<->mfn mappings. So we have them destroy the list and lock 20135084Sjohnlev * it till they are done with their update. 20145084Sjohnlev */ 20155084Sjohnlev void 20165084Sjohnlev clear_and_lock_contig_pfnlist() 20175084Sjohnlev { 20185084Sjohnlev pfn_t *listp = NULL; 20195084Sjohnlev size_t listsize; 20205084Sjohnlev 20215529Ssmaybe mutex_enter(&contig_list_lock); 20225084Sjohnlev if (contig_pfn_list != NULL) { 20235084Sjohnlev listp = contig_pfn_list; 20245084Sjohnlev listsize = contig_pfn_max * sizeof (pfn_t); 20255084Sjohnlev contig_pfn_list = NULL; 20265084Sjohnlev contig_pfn_max = contig_pfn_cnt = 0; 20275084Sjohnlev } 20285084Sjohnlev if (listp != NULL) 20295084Sjohnlev kmem_free(listp, listsize); 20305084Sjohnlev } 20315084Sjohnlev 20325084Sjohnlev /* 20335084Sjohnlev * Unlock the contig_pfn_list. The next attempted use of it will cause 20345084Sjohnlev * it to be re-created. 
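 *
 * A caller about to remap many pfn<->mfn translations uses the pair
 * roughly as:
 *	clear_and_lock_contig_pfnlist();
 *	... perform the bulk pfn<->mfn updates ...
 *	unlock_contig_pfnlist();
 * so the stale list is thrown away and rebuilt lazily by the next
 * find_contig_free() call.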
20355084Sjohnlev */ 20365084Sjohnlev void 20375084Sjohnlev unlock_contig_pfnlist() 20385084Sjohnlev { 20395529Ssmaybe mutex_exit(&contig_list_lock); 20405084Sjohnlev } 20415084Sjohnlev 20425084Sjohnlev /* 20435084Sjohnlev * Update the contiguous pfn list in response to a pfn <-> mfn reassignment 20445084Sjohnlev */ 20455084Sjohnlev void 20465084Sjohnlev update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn) 20475084Sjohnlev { 20485084Sjohnlev int probe_hi, probe_lo, probe_pos, insert_after, insert_point; 20495084Sjohnlev pfn_t probe_pfn; 20505084Sjohnlev mfn_t probe_mfn; 20515529Ssmaybe int drop_lock = 0; 20525529Ssmaybe 20535529Ssmaybe if (mutex_owner(&contig_list_lock) != curthread) { 20545529Ssmaybe drop_lock = 1; 20555529Ssmaybe mutex_enter(&contig_list_lock); 20565529Ssmaybe } 20575084Sjohnlev if (contig_pfn_list == NULL) 20585529Ssmaybe goto done; 20595084Sjohnlev contig_pfnlist_updates++; 20605084Sjohnlev /* 20615084Sjohnlev * Find the pfn in the current list. Use a binary chop to locate it. 20625084Sjohnlev */ 20635084Sjohnlev probe_hi = contig_pfn_cnt - 1; 20645084Sjohnlev probe_lo = 0; 20655084Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 20665084Sjohnlev while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) { 20675084Sjohnlev if (probe_pos == probe_lo) { /* pfn not in list */ 20685084Sjohnlev probe_pos = -1; 20695084Sjohnlev break; 20705084Sjohnlev } 20715084Sjohnlev if (pfn_to_mfn(probe_pfn) <= oldmfn) 20725084Sjohnlev probe_lo = probe_pos; 20735084Sjohnlev else 20745084Sjohnlev probe_hi = probe_pos; 20755084Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 20765084Sjohnlev } 20779010SStuart.Maybee@Sun.COM if (probe_pos >= 0) { 20789010SStuart.Maybee@Sun.COM /* 20799010SStuart.Maybee@Sun.COM * Remove pfn from list and ensure next alloc 20809010SStuart.Maybee@Sun.COM * position stays in bounds. 20819010SStuart.Maybee@Sun.COM */ 20829010SStuart.Maybee@Sun.COM if (--contig_pfn_cnt <= next_alloc_pfn) 20839010SStuart.Maybee@Sun.COM next_alloc_pfn = 0; 20845084Sjohnlev ovbcopy(&contig_pfn_list[probe_pos + 1], 20855084Sjohnlev &contig_pfn_list[probe_pos], 20865084Sjohnlev (contig_pfn_cnt - probe_pos) * sizeof (pfn_t)); 20875084Sjohnlev } 20885084Sjohnlev if (newmfn == MFN_INVALID) 20895084Sjohnlev goto done; 20905084Sjohnlev /* 20915084Sjohnlev * Check if new mfn has adjacent mfns in the list 20925084Sjohnlev */ 20935084Sjohnlev probe_hi = contig_pfn_cnt - 1; 20945084Sjohnlev probe_lo = 0; 20955084Sjohnlev insert_after = -2; 20965084Sjohnlev do { 20975084Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 20985084Sjohnlev probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]); 20995084Sjohnlev if (newmfn == probe_mfn + 1) 21005084Sjohnlev insert_after = probe_pos; 21015084Sjohnlev else if (newmfn == probe_mfn - 1) 21025084Sjohnlev insert_after = probe_pos - 1; 21035084Sjohnlev if (probe_pos == probe_lo) 21045084Sjohnlev break; 21055084Sjohnlev if (probe_mfn <= newmfn) 21065084Sjohnlev probe_lo = probe_pos; 21075084Sjohnlev else 21085084Sjohnlev probe_hi = probe_pos; 21095084Sjohnlev } while (insert_after == -2); 21105084Sjohnlev /* 21115084Sjohnlev * If there is space in the list and there are adjacent mfns 21125084Sjohnlev * insert the pfn in to its proper place in the list. 
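 * E.g. if the probe finds probe_mfn == newmfn - 1 the pfn is inserted just
 * after that entry, and if it finds probe_mfn == newmfn + 1 it is inserted
 * just before it, keeping the list in ascending mfn order.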
21135084Sjohnlev */ 21145084Sjohnlev if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) { 21155084Sjohnlev insert_point = insert_after + 1; 21165084Sjohnlev ovbcopy(&contig_pfn_list[insert_point], 21175084Sjohnlev &contig_pfn_list[insert_point + 1], 21185084Sjohnlev (contig_pfn_cnt - insert_point) * sizeof (pfn_t)); 21195084Sjohnlev contig_pfn_list[insert_point] = pfn; 21205084Sjohnlev contig_pfn_cnt++; 21215084Sjohnlev } 21225084Sjohnlev done: 21235529Ssmaybe if (drop_lock) 21245529Ssmaybe mutex_exit(&contig_list_lock); 21255084Sjohnlev } 21265084Sjohnlev 21275084Sjohnlev /* 21285084Sjohnlev * Called to (re-)populate the io_pool from the free page lists. 21295084Sjohnlev */ 21305084Sjohnlev long 21315084Sjohnlev populate_io_pool(void) 21325084Sjohnlev { 21335084Sjohnlev pfn_t pfn; 21345084Sjohnlev mfn_t mfn, max_mfn; 21355084Sjohnlev page_t *pp; 21365084Sjohnlev 21375084Sjohnlev /* 21385084Sjohnlev * Figure out the bounds of the pool on first invocation. 21395084Sjohnlev * We use a percentage of memory for the io pool size. 21405084Sjohnlev * we allow that to shrink, but not to less than a fixed minimum 21415084Sjohnlev */ 21425084Sjohnlev if (io_pool_cnt_max == 0) { 21435084Sjohnlev io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct); 21445084Sjohnlev io_pool_cnt_lowater = io_pool_cnt_max; 21455084Sjohnlev /* 21465084Sjohnlev * This is the first time in populate_io_pool, grab a va to use 21475084Sjohnlev * when we need to allocate pages. 21485084Sjohnlev */ 21495084Sjohnlev io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); 21505084Sjohnlev } 21515084Sjohnlev /* 21525084Sjohnlev * If we are out of pages in the pool, then grow the size of the pool 21535084Sjohnlev */ 21546159Ssmaybe if (io_pool_cnt == 0) { 21556159Ssmaybe /* 21566159Ssmaybe * Grow the max size of the io pool by 5%, but never more than 21576159Ssmaybe * 25% of physical memory. 21586159Ssmaybe */ 21596159Ssmaybe if (io_pool_cnt_max < physmem / 4) 21606159Ssmaybe io_pool_cnt_max += io_pool_cnt_max / 20; 21616159Ssmaybe } 21625084Sjohnlev io_pool_grows++; /* should be a kstat? */ 21635084Sjohnlev 21645084Sjohnlev /* 21655084Sjohnlev * Get highest mfn on this platform, but limit to the 32 bit DMA max. 21665084Sjohnlev */ 21675084Sjohnlev (void) mfn_to_pfn(start_mfn); 21685084Sjohnlev max_mfn = MIN(cached_max_mfn, PFN_4GIG); 21695084Sjohnlev for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) { 21705084Sjohnlev pfn = mfn_to_pfn(mfn); 21715084Sjohnlev if (pfn & PFN_IS_FOREIGN_MFN) 21725084Sjohnlev continue; 21735084Sjohnlev /* 21745084Sjohnlev * try to allocate it from free pages 21755084Sjohnlev */ 21765084Sjohnlev pp = page_numtopp_alloc(pfn); 21775084Sjohnlev if (pp == NULL) 21785084Sjohnlev continue; 21795084Sjohnlev PP_CLRFREE(pp); 21805084Sjohnlev add_page_to_pool(pp, 1); 21815084Sjohnlev if (io_pool_cnt >= io_pool_cnt_max) 21825084Sjohnlev break; 21835084Sjohnlev } 21845084Sjohnlev 21855084Sjohnlev return (io_pool_cnt); 21865084Sjohnlev } 21875084Sjohnlev 21885084Sjohnlev /* 21895084Sjohnlev * Destroy a page that was being used for DMA I/O. It may or 21905084Sjohnlev * may not actually go back to the io_pool. 
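 * Only dom0 pages whose mfn lies below 4G are candidates for the io_pool;
 * domU pages and higher-mfn pages always go straight back to the free lists.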
21915084Sjohnlev */ 21925084Sjohnlev void 21935084Sjohnlev page_destroy_io(page_t *pp) 21945084Sjohnlev { 21955084Sjohnlev mfn_t mfn = mfn_list[pp->p_pagenum]; 21965084Sjohnlev 21975084Sjohnlev /* 21985084Sjohnlev * When the page was alloc'd a reservation was made, release it now 21995084Sjohnlev */ 22005084Sjohnlev page_unresv(1); 22015084Sjohnlev /* 22025084Sjohnlev * Unload translations, if any, then hash out the 22035084Sjohnlev * page to erase its identity. 22045084Sjohnlev */ 22055084Sjohnlev (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 22065084Sjohnlev page_hashout(pp, NULL); 22075084Sjohnlev 22085084Sjohnlev /* 22095084Sjohnlev * If the page came from the free lists, just put it back to them. 22105084Sjohnlev * DomU pages always go on the free lists as well. 22115084Sjohnlev */ 22125084Sjohnlev if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) { 22135084Sjohnlev page_free(pp, 1); 22145084Sjohnlev return; 22155084Sjohnlev } 22165084Sjohnlev 22175084Sjohnlev add_page_to_pool(pp, 0); 22185084Sjohnlev } 22195084Sjohnlev 22205084Sjohnlev 22215084Sjohnlev long contig_searches; /* count of times contig pages requested */ 22225084Sjohnlev long contig_search_restarts; /* count of contig ranges tried */ 22235084Sjohnlev long contig_search_failed; /* count of contig alloc failures */ 22245084Sjohnlev 22255084Sjohnlev /* 22265084Sjohnlev * Look thru the contiguous pfns that are not part of the io_pool for 22275084Sjohnlev * contiguous free pages. Return a list of the found pages or NULL. 22285084Sjohnlev */ 22295084Sjohnlev page_t * 22306282Ssmaybe find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg) 22315084Sjohnlev { 22325084Sjohnlev page_t *pp, *plist = NULL; 22336282Ssmaybe mfn_t mfn, prev_mfn, start_mfn; 22345084Sjohnlev pfn_t pfn; 22355084Sjohnlev int pages_needed, pages_requested; 22365084Sjohnlev int search_start; 22375084Sjohnlev 22385084Sjohnlev /* 22395084Sjohnlev * create the contig pfn list if not already done 22405084Sjohnlev */ 22415529Ssmaybe retry: 22425529Ssmaybe mutex_enter(&contig_list_lock); 22435084Sjohnlev if (contig_pfn_list == NULL) { 22445529Ssmaybe mutex_exit(&contig_list_lock); 22455529Ssmaybe if (!create_contig_pfnlist(flags)) { 22465084Sjohnlev return (NULL); 22475084Sjohnlev } 22485529Ssmaybe goto retry; 22495084Sjohnlev } 22505084Sjohnlev contig_searches++; 22515084Sjohnlev /* 22525084Sjohnlev * Search contiguous pfn list for physically contiguous pages not in 22535084Sjohnlev * the io_pool. Start the search where the last search left off. 22545084Sjohnlev */ 22555843Ssmaybe pages_requested = pages_needed = npages; 22565084Sjohnlev search_start = next_alloc_pfn; 22576282Ssmaybe start_mfn = prev_mfn = 0; 22585084Sjohnlev while (pages_needed) { 22595084Sjohnlev pfn = contig_pfn_list[next_alloc_pfn]; 22605084Sjohnlev mfn = pfn_to_mfn(pfn); 22616282Ssmaybe /* 22626282Ssmaybe * Check if mfn is first one or contig to previous one and 22636282Ssmaybe * if page corresponding to mfn is free and that mfn 22646282Ssmaybe * range is not crossing a segment boundary. 
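 * For example, with dma_attr_seg = 0xffffffff the mask pfnseg is 0xfffff,
 * and a run that began at start_mfn keeps growing only while
 * (mfn & pfnseg) >= (start_mfn & pfnseg), i.e. while it has not wrapped
 * past a 4G-aligned segment boundary.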
22656282Ssmaybe */ 22665084Sjohnlev if ((prev_mfn == 0 || mfn == prev_mfn + 1) && 22676282Ssmaybe (pp = page_numtopp_alloc(pfn)) != NULL && 22686282Ssmaybe !((mfn & pfnseg) < (start_mfn & pfnseg))) { 22695084Sjohnlev PP_CLRFREE(pp); 22705084Sjohnlev page_io_pool_add(&plist, pp); 22715084Sjohnlev pages_needed--; 22726282Ssmaybe if (prev_mfn == 0) 22736282Ssmaybe start_mfn = mfn; 22745084Sjohnlev prev_mfn = mfn; 22755084Sjohnlev } else { 22765084Sjohnlev contig_search_restarts++; 22775084Sjohnlev /* 22785084Sjohnlev * free partial page list 22795084Sjohnlev */ 22805084Sjohnlev while (plist != NULL) { 22815084Sjohnlev pp = plist; 22825084Sjohnlev page_io_pool_sub(&plist, pp, pp); 22835084Sjohnlev page_free(pp, 1); 22845084Sjohnlev } 22855084Sjohnlev pages_needed = pages_requested; 22866282Ssmaybe start_mfn = prev_mfn = 0; 22875084Sjohnlev } 22885084Sjohnlev if (++next_alloc_pfn == contig_pfn_cnt) 22895084Sjohnlev next_alloc_pfn = 0; 22905084Sjohnlev if (next_alloc_pfn == search_start) 22915084Sjohnlev break; /* all pfns searched */ 22925084Sjohnlev } 22935529Ssmaybe mutex_exit(&contig_list_lock); 22945084Sjohnlev if (pages_needed) { 22955084Sjohnlev contig_search_failed++; 22965084Sjohnlev /* 22975084Sjohnlev * Failed to find enough contig pages. 22985084Sjohnlev * free partial page list 22995084Sjohnlev */ 23005084Sjohnlev while (plist != NULL) { 23015084Sjohnlev pp = plist; 23025084Sjohnlev page_io_pool_sub(&plist, pp, pp); 23035084Sjohnlev page_free(pp, 1); 23045084Sjohnlev } 23055084Sjohnlev } 23065084Sjohnlev return (plist); 23075084Sjohnlev } 23085084Sjohnlev 23095084Sjohnlev /* 23105843Ssmaybe * Search the reserved io pool pages for a page range with the 23115843Ssmaybe * desired characteristics. 23125084Sjohnlev */ 23135084Sjohnlev page_t * 23145843Ssmaybe page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg) 23155084Sjohnlev { 23165843Ssmaybe page_t *pp_first, *pp_last; 23175843Ssmaybe page_t *pp, **poolp; 23185843Ssmaybe pgcnt_t nwanted, pfnalign; 23195084Sjohnlev uint64_t pfnseg; 23205843Ssmaybe mfn_t mfn, tmfn, hi_mfn, lo_mfn; 23215843Ssmaybe int align, attempt = 0; 23225843Ssmaybe 23235843Ssmaybe if (minctg == 1) 23245843Ssmaybe contig = 0; 23255084Sjohnlev lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 23265084Sjohnlev hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 23275843Ssmaybe pfnseg = mmu_btop(mattr->dma_attr_seg); 23285084Sjohnlev align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 23295084Sjohnlev if (align > MMU_PAGESIZE) 23305084Sjohnlev pfnalign = mmu_btop(align); 23315843Ssmaybe else 23325843Ssmaybe pfnalign = 0; 23335843Ssmaybe 23345084Sjohnlev try_again: 23355084Sjohnlev /* 23365084Sjohnlev * See if we want pages for a legacy device 23375084Sjohnlev */ 23385084Sjohnlev if (hi_mfn < PFN_16MEG) 23395084Sjohnlev poolp = &io_pool_16m; 23405084Sjohnlev else 23415084Sjohnlev poolp = &io_pool_4g; 23425084Sjohnlev try_smaller: 23435084Sjohnlev /* 23445843Ssmaybe * Take pages from I/O pool. We'll use pages from the highest 23455843Ssmaybe * MFN range possible. 
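 * For example, a request for minctg = 16 pages with 64K alignment
 * (pfnalign = 16) only accepts a candidate run whose lowest mfn,
 * tmfn = mfn - (minctg - 1), is itself a multiple of 16.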
23465084Sjohnlev */ 23475084Sjohnlev pp_first = pp_last = NULL; 23485084Sjohnlev mutex_enter(&io_pool_lock); 23495843Ssmaybe nwanted = minctg; 23505843Ssmaybe for (pp = *poolp; pp && nwanted > 0; ) { 23515084Sjohnlev pp = pp->p_prev; 23525084Sjohnlev 23535084Sjohnlev /* 23545084Sjohnlev * skip pages above allowable range 23555084Sjohnlev */ 23565084Sjohnlev mfn = mfn_list[pp->p_pagenum]; 23575084Sjohnlev if (hi_mfn < mfn) 23585084Sjohnlev goto skip; 23595084Sjohnlev 23605084Sjohnlev /* 23615084Sjohnlev * stop at pages below allowable range 23625084Sjohnlev */ 23635084Sjohnlev if (lo_mfn > mfn) 23645084Sjohnlev break; 23655084Sjohnlev restart: 23665084Sjohnlev if (pp_last == NULL) { 23675084Sjohnlev /* 23685084Sjohnlev * Check alignment 23695084Sjohnlev */ 23705843Ssmaybe tmfn = mfn - (minctg - 1); 23715843Ssmaybe if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign)) 23725843Ssmaybe goto skip; /* not properly aligned */ 23735084Sjohnlev /* 23745084Sjohnlev * Check segment 23755084Sjohnlev */ 23765084Sjohnlev if ((mfn & pfnseg) < (tmfn & pfnseg)) 23775843Ssmaybe goto skip; /* crosses seg boundary */ 23785084Sjohnlev /* 23795084Sjohnlev * Start building page list 23805084Sjohnlev */ 23815084Sjohnlev pp_first = pp_last = pp; 23825843Ssmaybe nwanted--; 23835084Sjohnlev } else { 23845084Sjohnlev /* 23855084Sjohnlev * check physical contiguity if required 23865084Sjohnlev */ 23875084Sjohnlev if (contig && 23885084Sjohnlev mfn_list[pp_first->p_pagenum] != mfn + 1) { 23895084Sjohnlev /* 23905084Sjohnlev * not a contiguous page, restart list. 23915084Sjohnlev */ 23925084Sjohnlev pp_last = NULL; 23935843Ssmaybe nwanted = minctg; 23945084Sjohnlev goto restart; 23955084Sjohnlev } else { /* add page to list */ 23965084Sjohnlev pp_first = pp; 23975843Ssmaybe nwanted--; 23985084Sjohnlev } 23995084Sjohnlev } 24005084Sjohnlev skip: 24015084Sjohnlev if (pp == *poolp) 24025084Sjohnlev break; 24035084Sjohnlev } 24045084Sjohnlev 24055084Sjohnlev /* 24065084Sjohnlev * If we didn't find memory. Try the more constrained pool, then 24075843Ssmaybe * sweep free pages into the DMA pool and try again. 24085084Sjohnlev */ 24095843Ssmaybe if (nwanted != 0) { 24105084Sjohnlev mutex_exit(&io_pool_lock); 24115084Sjohnlev /* 24125843Ssmaybe * If we were looking in the less constrained pool and 24135843Ssmaybe * didn't find pages, try the more constrained pool. 
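 * The overall fallback order is: io_pool_4g, then io_pool_16m, then
 * kmem_reap() plus populate_io_pool() to replenish the pools, with the
 * whole search retried for up to four passes before giving up.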
24145084Sjohnlev */ 24155084Sjohnlev if (poolp == &io_pool_4g) { 24165084Sjohnlev poolp = &io_pool_16m; 24175084Sjohnlev goto try_smaller; 24185084Sjohnlev } 24195084Sjohnlev kmem_reap(); 24205084Sjohnlev if (++attempt < 4) { 24215084Sjohnlev /* 24225084Sjohnlev * Grab some more io_pool pages 24235084Sjohnlev */ 24245084Sjohnlev (void) populate_io_pool(); 24255843Ssmaybe goto try_again; /* go around and retry */ 24265084Sjohnlev } 24275843Ssmaybe return (NULL); 24285084Sjohnlev } 24295084Sjohnlev /* 24305084Sjohnlev * Found the pages, now snip them from the list 24315084Sjohnlev */ 24325084Sjohnlev page_io_pool_sub(poolp, pp_first, pp_last); 24335843Ssmaybe io_pool_cnt -= minctg; 24345843Ssmaybe /* 24355843Ssmaybe * reset low water mark 24365843Ssmaybe */ 24375084Sjohnlev if (io_pool_cnt < io_pool_cnt_lowater) 24385843Ssmaybe io_pool_cnt_lowater = io_pool_cnt; 24395084Sjohnlev mutex_exit(&io_pool_lock); 24405843Ssmaybe return (pp_first); 24415843Ssmaybe } 24425843Ssmaybe 24435843Ssmaybe page_t * 24445843Ssmaybe page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr, 24455843Ssmaybe ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg) 24465843Ssmaybe { 24475843Ssmaybe uint_t kflags; 24485843Ssmaybe int order, extra, extpages, i, contig, nbits, extents; 24495843Ssmaybe page_t *pp, *expp, *pp_first, **pplist = NULL; 24505843Ssmaybe mfn_t *mfnlist = NULL; 24515843Ssmaybe 24525843Ssmaybe contig = flags & PG_PHYSCONTIG; 24535843Ssmaybe if (minctg == 1) 24545843Ssmaybe contig = 0; 24555843Ssmaybe flags &= ~PG_PHYSCONTIG; 24565843Ssmaybe kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP; 24575843Ssmaybe /* 24585843Ssmaybe * Hypervisor will allocate extents, if we want contig 24595843Ssmaybe * pages extent must be >= minctg 24605843Ssmaybe */ 24615843Ssmaybe if (contig) { 24625843Ssmaybe order = highbit(minctg) - 1; 24635843Ssmaybe if (minctg & ((1 << order) - 1)) 24645843Ssmaybe order++; 24655843Ssmaybe extpages = 1 << order; 24665843Ssmaybe } else { 24675843Ssmaybe order = 0; 24685843Ssmaybe extpages = minctg; 24695843Ssmaybe } 24705843Ssmaybe if (extpages > minctg) { 24715843Ssmaybe extra = extpages - minctg; 24725843Ssmaybe if (!page_resv(extra, kflags)) 24735843Ssmaybe return (NULL); 24745843Ssmaybe } 24755843Ssmaybe pp_first = NULL; 24765843Ssmaybe pplist = kmem_alloc(extpages * sizeof (page_t *), kflags); 24775843Ssmaybe if (pplist == NULL) 24785843Ssmaybe goto balloon_fail; 24795843Ssmaybe mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags); 24805843Ssmaybe if (mfnlist == NULL) 24815843Ssmaybe goto balloon_fail; 24825843Ssmaybe pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr); 24835843Ssmaybe if (pp == NULL) 24845843Ssmaybe goto balloon_fail; 24855843Ssmaybe pp_first = pp; 24865843Ssmaybe if (extpages > minctg) { 24875843Ssmaybe /* 24885843Ssmaybe * fill out the rest of extent pages to swap 24895843Ssmaybe * with the hypervisor 24905843Ssmaybe */ 24915843Ssmaybe for (i = 0; i < extra; i++) { 24925843Ssmaybe expp = page_create_va(vp, 24935843Ssmaybe (u_offset_t)(uintptr_t)io_pool_kva, 24945843Ssmaybe PAGESIZE, flags, &kvseg, io_pool_kva); 24955843Ssmaybe if (expp == NULL) 24965843Ssmaybe goto balloon_fail; 24975843Ssmaybe (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD); 24985843Ssmaybe page_io_unlock(expp); 24995843Ssmaybe page_hashout(expp, NULL); 25005843Ssmaybe page_io_lock(expp); 25015843Ssmaybe /* 25025843Ssmaybe * add page to end of list 25035843Ssmaybe */ 25045843Ssmaybe expp->p_prev = pp_first->p_prev; 25055843Ssmaybe expp->p_next = 
pp_first; 25065843Ssmaybe expp->p_prev->p_next = expp; 25075843Ssmaybe pp_first->p_prev = expp; 25085084Sjohnlev } 25095843Ssmaybe 25105843Ssmaybe } 25115843Ssmaybe for (i = 0; i < extpages; i++) { 25125843Ssmaybe pplist[i] = pp; 25135084Sjohnlev pp = pp->p_next; 25145843Ssmaybe } 25155843Ssmaybe nbits = highbit(mattr->dma_attr_addr_hi); 25165843Ssmaybe extents = contig ? 1 : minctg; 25175843Ssmaybe if (balloon_replace_pages(extents, pplist, nbits, order, 25185843Ssmaybe mfnlist) != extents) { 25195843Ssmaybe if (ioalloc_dbg) 25205843Ssmaybe cmn_err(CE_NOTE, "request to hypervisor" 25215843Ssmaybe " for %d pages, maxaddr %" PRIx64 " failed", 25225843Ssmaybe extpages, mattr->dma_attr_addr_hi); 25235843Ssmaybe goto balloon_fail; 25245843Ssmaybe } 25255843Ssmaybe 25265843Ssmaybe kmem_free(pplist, extpages * sizeof (page_t *)); 25275843Ssmaybe kmem_free(mfnlist, extpages * sizeof (mfn_t)); 25285843Ssmaybe /* 25295843Ssmaybe * Return any excess pages to free list 25305843Ssmaybe */ 25315843Ssmaybe if (extpages > minctg) { 25325843Ssmaybe for (i = 0; i < extra; i++) { 25335843Ssmaybe pp = pp_first->p_prev; 25345843Ssmaybe page_sub(&pp_first, pp); 25355843Ssmaybe page_io_unlock(pp); 25365843Ssmaybe page_unresv(1); 25375843Ssmaybe page_free(pp, 1); 25385843Ssmaybe } 25395843Ssmaybe } 25405084Sjohnlev return (pp_first); 25415084Sjohnlev balloon_fail: 25425084Sjohnlev /* 25435084Sjohnlev * Return pages to free list and return failure 25445084Sjohnlev */ 25455084Sjohnlev while (pp_first != NULL) { 25465084Sjohnlev pp = pp_first; 25475084Sjohnlev page_sub(&pp_first, pp); 25485084Sjohnlev page_io_unlock(pp); 25495084Sjohnlev if (pp->p_vnode != NULL) 25505084Sjohnlev page_hashout(pp, NULL); 25515084Sjohnlev page_free(pp, 1); 25525084Sjohnlev } 25535084Sjohnlev if (pplist) 25545084Sjohnlev kmem_free(pplist, extpages * sizeof (page_t *)); 25555084Sjohnlev if (mfnlist) 25565084Sjohnlev kmem_free(mfnlist, extpages * sizeof (mfn_t)); 25575843Ssmaybe page_unresv(extpages - minctg); 25585843Ssmaybe return (NULL); 25595843Ssmaybe } 25605843Ssmaybe 25615843Ssmaybe static void 25625843Ssmaybe return_partial_alloc(page_t *plist) 25635843Ssmaybe { 25645843Ssmaybe page_t *pp; 25655843Ssmaybe 25665843Ssmaybe while (plist != NULL) { 25675843Ssmaybe pp = plist; 25685843Ssmaybe page_sub(&plist, pp); 25697173Smrj page_io_unlock(pp); 25705843Ssmaybe page_destroy_io(pp); 25715843Ssmaybe } 25725843Ssmaybe } 25735843Ssmaybe 25745843Ssmaybe static page_t * 25755843Ssmaybe page_get_contigpages( 25765843Ssmaybe struct vnode *vp, 25775843Ssmaybe u_offset_t off, 25785843Ssmaybe int *npagesp, 25795843Ssmaybe uint_t flags, 25805843Ssmaybe caddr_t vaddr, 25815843Ssmaybe ddi_dma_attr_t *mattr) 25825843Ssmaybe { 25835843Ssmaybe mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 25845843Ssmaybe page_t *plist; /* list to return */ 25855843Ssmaybe page_t *pp, *mcpl; 25865843Ssmaybe int contig, anyaddr, npages, getone = 0; 25875843Ssmaybe mfn_t lo_mfn; 25885843Ssmaybe mfn_t hi_mfn; 25895843Ssmaybe pgcnt_t pfnalign = 0; 25905843Ssmaybe int align, sgllen; 25915843Ssmaybe uint64_t pfnseg; 25925843Ssmaybe pgcnt_t minctg; 25935843Ssmaybe 25945843Ssmaybe npages = *npagesp; 25955843Ssmaybe ASSERT(mattr != NULL); 25965843Ssmaybe lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 25975843Ssmaybe hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 25985843Ssmaybe sgllen = mattr->dma_attr_sgllen; 25995843Ssmaybe pfnseg = mmu_btop(mattr->dma_attr_seg); 26005843Ssmaybe align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 26015843Ssmaybe 
if (align > MMU_PAGESIZE) 26025843Ssmaybe pfnalign = mmu_btop(align); 26035843Ssmaybe 26045843Ssmaybe /* 26055843Ssmaybe * Clear the contig flag if only one page is needed. 26065843Ssmaybe */ 26075843Ssmaybe contig = flags & PG_PHYSCONTIG; 26085843Ssmaybe if (npages == 1) { 26095843Ssmaybe getone = 1; 26105843Ssmaybe contig = 0; 26115843Ssmaybe } 26125843Ssmaybe 26135843Ssmaybe /* 26145843Ssmaybe * Check if any page in the system is fine. 26155843Ssmaybe */ 26165843Ssmaybe anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign; 26175843Ssmaybe if (!contig && anyaddr) { 26185843Ssmaybe flags &= ~PG_PHYSCONTIG; 26195843Ssmaybe plist = page_create_va(vp, off, npages * MMU_PAGESIZE, 26205843Ssmaybe flags, &kvseg, vaddr); 26215843Ssmaybe if (plist != NULL) { 26225843Ssmaybe *npagesp = 0; 26235843Ssmaybe return (plist); 26245843Ssmaybe } 26255843Ssmaybe } 26265843Ssmaybe plist = NULL; 26275843Ssmaybe minctg = howmany(npages, sgllen); 26285843Ssmaybe while (npages > sgllen || getone) { 26296015Ssmaybe if (minctg > npages) 26306015Ssmaybe minctg = npages; 26316015Ssmaybe mcpl = NULL; 26325843Ssmaybe /* 26335843Ssmaybe * We could just want unconstrained but contig pages. 26345843Ssmaybe */ 26356282Ssmaybe if (anyaddr && contig) { 26365843Ssmaybe /* 26375843Ssmaybe * Look for free contig pages to satisfy the request. 26385843Ssmaybe */ 26396282Ssmaybe mcpl = find_contig_free(minctg, flags, pfnseg); 26405843Ssmaybe } 26415843Ssmaybe /* 26425843Ssmaybe * Try the reserved io pools next 26435843Ssmaybe */ 26445843Ssmaybe if (mcpl == NULL) 26455843Ssmaybe mcpl = page_io_pool_alloc(mattr, contig, minctg); 26465843Ssmaybe if (mcpl != NULL) { 26475843Ssmaybe pp = mcpl; 26485843Ssmaybe do { 26495843Ssmaybe if (!page_hashin(pp, vp, off, NULL)) { 26505843Ssmaybe panic("page_get_contigpages:" 26515843Ssmaybe " hashin failed" 26525843Ssmaybe " pp %p, vp %p, off %llx", 26535843Ssmaybe (void *)pp, (void *)vp, off); 26545843Ssmaybe } 26555843Ssmaybe off += MMU_PAGESIZE; 26565843Ssmaybe PP_CLRFREE(pp); 26575843Ssmaybe PP_CLRAGED(pp); 26585843Ssmaybe page_set_props(pp, P_REF); 26595843Ssmaybe page_io_lock(pp); 26605843Ssmaybe pp = pp->p_next; 26615843Ssmaybe } while (pp != mcpl); 26625843Ssmaybe } else { 26635843Ssmaybe /* 26645843Ssmaybe * Hypervisor exchange doesn't handle segment or 26655843Ssmaybe * alignment constraints 26665843Ssmaybe */ 26675843Ssmaybe if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi || 26685843Ssmaybe pfnalign) 26695843Ssmaybe goto fail; 26705843Ssmaybe /* 26715843Ssmaybe * Try exchanging pages with the hypervisor 26725843Ssmaybe */ 26735843Ssmaybe mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr, 26745843Ssmaybe flags, minctg); 26755843Ssmaybe if (mcpl == NULL) 26765843Ssmaybe goto fail; 26775843Ssmaybe off += minctg * MMU_PAGESIZE; 26785843Ssmaybe } 26795843Ssmaybe check_dma(mattr, mcpl, minctg); 26805843Ssmaybe /* 26815843Ssmaybe * Here with a minctg run of contiguous pages, add them to the 26825843Ssmaybe * list we will return for this request. 26835843Ssmaybe */ 26845843Ssmaybe page_list_concat(&plist, &mcpl); 26855843Ssmaybe npages -= minctg; 26865843Ssmaybe *npagesp = npages; 26875843Ssmaybe sgllen--; 26886015Ssmaybe if (getone) 26896015Ssmaybe break; 26905843Ssmaybe } 26915843Ssmaybe return (plist); 26925843Ssmaybe fail: 26935843Ssmaybe return_partial_alloc(plist); 26945843Ssmaybe return (NULL); 26955843Ssmaybe } 26965843Ssmaybe 26975843Ssmaybe /* 26985843Ssmaybe * Allocator for domain 0 I/O pages. 
We match the required 26995843Ssmaybe * DMA attributes and contiguity constraints. 27005843Ssmaybe */ 27015843Ssmaybe /*ARGSUSED*/ 27025843Ssmaybe page_t * 27035843Ssmaybe page_create_io( 27045843Ssmaybe struct vnode *vp, 27055843Ssmaybe u_offset_t off, 27065843Ssmaybe uint_t bytes, 27075843Ssmaybe uint_t flags, 27085843Ssmaybe struct as *as, 27095843Ssmaybe caddr_t vaddr, 27105843Ssmaybe ddi_dma_attr_t *mattr) 27115843Ssmaybe { 27125843Ssmaybe page_t *plist = NULL, *pp; 27135843Ssmaybe int npages = 0, contig, anyaddr, pages_req; 27145843Ssmaybe mfn_t lo_mfn; 27155843Ssmaybe mfn_t hi_mfn; 27165843Ssmaybe pgcnt_t pfnalign = 0; 27175843Ssmaybe int align; 27185843Ssmaybe int is_domu = 0; 27195843Ssmaybe int dummy, bytes_got; 27205843Ssmaybe mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 27215843Ssmaybe 27225843Ssmaybe ASSERT(mattr != NULL); 27235843Ssmaybe lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 27245843Ssmaybe hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 27255843Ssmaybe align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 27265843Ssmaybe if (align > MMU_PAGESIZE) 27275843Ssmaybe pfnalign = mmu_btop(align); 27285843Ssmaybe 27295843Ssmaybe /* 27305843Ssmaybe * Clear the contig flag if only one page is needed or the scatter 27315843Ssmaybe * gather list length is >= npages. 27325843Ssmaybe */ 27335843Ssmaybe pages_req = npages = mmu_btopr(bytes); 27345843Ssmaybe contig = (flags & PG_PHYSCONTIG); 27355843Ssmaybe bytes = P2ROUNDUP(bytes, MMU_PAGESIZE); 27365843Ssmaybe if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages) 27375843Ssmaybe contig = 0; 27385843Ssmaybe 27395843Ssmaybe /* 27405843Ssmaybe * Check if any old page in the system is fine. 27415843Ssmaybe * DomU should always go down this path. 27425843Ssmaybe */ 27435843Ssmaybe is_domu = !DOMAIN_IS_INITDOMAIN(xen_info); 27445843Ssmaybe anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign; 27455843Ssmaybe if ((!contig && anyaddr) || is_domu) { 27465843Ssmaybe flags &= ~PG_PHYSCONTIG; 27475843Ssmaybe plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr); 27485843Ssmaybe if (plist != NULL) 27495843Ssmaybe return (plist); 27505843Ssmaybe else if (is_domu) 27515843Ssmaybe return (NULL); /* no memory available */ 27525843Ssmaybe } 27535843Ssmaybe /* 27545843Ssmaybe * DomU should never reach here 27555843Ssmaybe */ 27565843Ssmaybe if (contig) { 27575843Ssmaybe plist = page_get_contigpages(vp, off, &npages, flags, vaddr, 27585843Ssmaybe mattr); 27595843Ssmaybe if (plist == NULL) 27605843Ssmaybe goto fail; 27615843Ssmaybe bytes_got = (pages_req - npages) << MMU_PAGESHIFT; 27625843Ssmaybe vaddr += bytes_got; 27635843Ssmaybe off += bytes_got; 27645843Ssmaybe /* 27655843Ssmaybe * We now have all the contiguous pages we need, but 27665843Ssmaybe * we may still need additional non-contiguous pages. 27675843Ssmaybe */ 27685843Ssmaybe } 27695843Ssmaybe /* 27705843Ssmaybe * now loop collecting the requested number of pages, these do 27715843Ssmaybe * not have to be contiguous pages but we will use the contig 27725843Ssmaybe * page alloc code to get the pages since it will honor any 27735843Ssmaybe * other constraints the pages may have. 
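 * Each pass below asks page_get_contigpages() for a single page, which
 * drops the contiguity requirement but still honors the DMA address,
 * alignment and segment limits carried in mattr.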
27745843Ssmaybe */ 27755843Ssmaybe while (npages--) { 27765843Ssmaybe dummy = 1; 27775843Ssmaybe pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr); 27785843Ssmaybe if (pp == NULL) 27795843Ssmaybe goto fail; 27805843Ssmaybe page_add(&plist, pp); 27815843Ssmaybe vaddr += MMU_PAGESIZE; 27825843Ssmaybe off += MMU_PAGESIZE; 27835843Ssmaybe } 27845843Ssmaybe return (plist); 27855843Ssmaybe fail: 27865843Ssmaybe /* 27875843Ssmaybe * Failed to get enough pages, return ones we did get 27885843Ssmaybe */ 27895843Ssmaybe return_partial_alloc(plist); 27905084Sjohnlev return (NULL); 27915084Sjohnlev } 27925084Sjohnlev 27935084Sjohnlev /* 27945084Sjohnlev * Lock and return the page with the highest mfn that we can find. last_mfn 27955084Sjohnlev * holds the last one found, so the next search can start from there. We 27965084Sjohnlev * also keep a counter so that we don't loop forever if the machine has no 27975084Sjohnlev * free pages. 27985084Sjohnlev * 27995084Sjohnlev * This is called from the balloon thread to find pages to give away. new_high 28005084Sjohnlev * is used when new mfn's have been added to the system - we will reset our 28015084Sjohnlev * search if the new mfn's are higher than our current search position. 28025084Sjohnlev */ 28035084Sjohnlev page_t * 28045084Sjohnlev page_get_high_mfn(mfn_t new_high) 28055084Sjohnlev { 28065084Sjohnlev static mfn_t last_mfn = 0; 28075084Sjohnlev pfn_t pfn; 28085084Sjohnlev page_t *pp; 28095084Sjohnlev ulong_t loop_count = 0; 28105084Sjohnlev 28115084Sjohnlev if (new_high > last_mfn) 28125084Sjohnlev last_mfn = new_high; 28135084Sjohnlev 28145084Sjohnlev for (; loop_count < mfn_count; loop_count++, last_mfn--) { 28155084Sjohnlev if (last_mfn == 0) { 28165084Sjohnlev last_mfn = cached_max_mfn; 28175084Sjohnlev } 28185084Sjohnlev 28195084Sjohnlev pfn = mfn_to_pfn(last_mfn); 28205084Sjohnlev if (pfn & PFN_IS_FOREIGN_MFN) 28215084Sjohnlev continue; 28225084Sjohnlev 28235084Sjohnlev /* See if the page is free. If so, lock it. 
*/ 28245084Sjohnlev pp = page_numtopp_alloc(pfn); 28255084Sjohnlev if (pp == NULL) 28265084Sjohnlev continue; 28275084Sjohnlev PP_CLRFREE(pp); 28285084Sjohnlev 28295084Sjohnlev ASSERT(PAGE_EXCL(pp)); 28305084Sjohnlev ASSERT(pp->p_vnode == NULL); 28315084Sjohnlev ASSERT(!hat_page_is_mapped(pp)); 28325084Sjohnlev last_mfn--; 28335084Sjohnlev return (pp); 28345084Sjohnlev } 28355084Sjohnlev return (NULL); 28365084Sjohnlev } 28375084Sjohnlev 28385084Sjohnlev #else /* !__xpv */ 28395084Sjohnlev 28400Sstevel@tonic-gate /* 28410Sstevel@tonic-gate * get a page from any list with the given mnode 28420Sstevel@tonic-gate */ 28435084Sjohnlev static page_t * 28440Sstevel@tonic-gate page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 28450Sstevel@tonic-gate int mnode, int mtype, ddi_dma_attr_t *dma_attr) 28460Sstevel@tonic-gate { 28472961Sdp78419 kmutex_t *pcm; 28482961Sdp78419 int i; 28492961Sdp78419 page_t *pp; 28502961Sdp78419 page_t *first_pp; 28512961Sdp78419 uint64_t pgaddr; 28522961Sdp78419 ulong_t bin; 28532961Sdp78419 int mtypestart; 28542961Sdp78419 int plw_initialized; 28552961Sdp78419 page_list_walker_t plw; 28560Sstevel@tonic-gate 28570Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_alloc); 28580Sstevel@tonic-gate 28590Sstevel@tonic-gate ASSERT((flags & PG_MATCH_COLOR) == 0); 28600Sstevel@tonic-gate ASSERT(szc == 0); 28610Sstevel@tonic-gate ASSERT(dma_attr != NULL); 28620Sstevel@tonic-gate 28630Sstevel@tonic-gate MTYPE_START(mnode, mtype, flags); 28640Sstevel@tonic-gate if (mtype < 0) { 28650Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocempty); 28660Sstevel@tonic-gate return (NULL); 28670Sstevel@tonic-gate } 28680Sstevel@tonic-gate 28690Sstevel@tonic-gate mtypestart = mtype; 28700Sstevel@tonic-gate 28710Sstevel@tonic-gate bin = origbin; 28720Sstevel@tonic-gate 28730Sstevel@tonic-gate /* 28740Sstevel@tonic-gate * check up to page_colors + 1 bins - origbin may be checked twice 28750Sstevel@tonic-gate * because of BIN_STEP skip 28760Sstevel@tonic-gate */ 28770Sstevel@tonic-gate do { 28782961Sdp78419 plw_initialized = 0; 28792961Sdp78419 28802961Sdp78419 for (plw.plw_count = 0; 28812961Sdp78419 plw.plw_count < page_colors; plw.plw_count++) { 28822961Sdp78419 28830Sstevel@tonic-gate if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 28840Sstevel@tonic-gate goto nextfreebin; 28850Sstevel@tonic-gate 28860Sstevel@tonic-gate pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 28870Sstevel@tonic-gate mutex_enter(pcm); 28880Sstevel@tonic-gate pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 28890Sstevel@tonic-gate first_pp = pp; 28900Sstevel@tonic-gate while (pp != NULL) { 28910Sstevel@tonic-gate if (page_trylock(pp, SE_EXCL) == 0) { 28920Sstevel@tonic-gate pp = pp->p_next; 28930Sstevel@tonic-gate if (pp == first_pp) { 28940Sstevel@tonic-gate pp = NULL; 28950Sstevel@tonic-gate } 28960Sstevel@tonic-gate continue; 28970Sstevel@tonic-gate } 28980Sstevel@tonic-gate 28990Sstevel@tonic-gate ASSERT(PP_ISFREE(pp)); 29000Sstevel@tonic-gate ASSERT(PP_ISAGED(pp)); 29010Sstevel@tonic-gate ASSERT(pp->p_vnode == NULL); 29020Sstevel@tonic-gate ASSERT(pp->p_hash == NULL); 29030Sstevel@tonic-gate ASSERT(pp->p_offset == (u_offset_t)-1); 29040Sstevel@tonic-gate ASSERT(pp->p_szc == szc); 29050Sstevel@tonic-gate ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 29060Sstevel@tonic-gate /* check if page within DMA attributes */ 29073446Smrj pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 29080Sstevel@tonic-gate if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 29090Sstevel@tonic-gate (pgaddr + MMU_PAGESIZE - 1 <= 
29100Sstevel@tonic-gate dma_attr->dma_attr_addr_hi)) { 29110Sstevel@tonic-gate break; 29120Sstevel@tonic-gate } 29130Sstevel@tonic-gate 29140Sstevel@tonic-gate /* continue looking */ 29150Sstevel@tonic-gate page_unlock(pp); 29160Sstevel@tonic-gate pp = pp->p_next; 29170Sstevel@tonic-gate if (pp == first_pp) 29180Sstevel@tonic-gate pp = NULL; 29190Sstevel@tonic-gate 29200Sstevel@tonic-gate } 29210Sstevel@tonic-gate if (pp != NULL) { 29220Sstevel@tonic-gate ASSERT(mtype == PP_2_MTYPE(pp)); 29230Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 29240Sstevel@tonic-gate 29250Sstevel@tonic-gate /* found a page with specified DMA attributes */ 29260Sstevel@tonic-gate page_sub(&PAGE_FREELISTS(mnode, szc, bin, 29270Sstevel@tonic-gate mtype), pp); 2928414Skchow page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 29290Sstevel@tonic-gate 29300Sstevel@tonic-gate if ((PP_ISFREE(pp) == 0) || 29310Sstevel@tonic-gate (PP_ISAGED(pp) == 0)) { 29320Sstevel@tonic-gate cmn_err(CE_PANIC, "page %p is not free", 29330Sstevel@tonic-gate (void *)pp); 29340Sstevel@tonic-gate } 29350Sstevel@tonic-gate 29360Sstevel@tonic-gate mutex_exit(pcm); 29370Sstevel@tonic-gate check_dma(dma_attr, pp, 1); 29380Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocok); 29390Sstevel@tonic-gate return (pp); 29400Sstevel@tonic-gate } 29410Sstevel@tonic-gate mutex_exit(pcm); 29420Sstevel@tonic-gate nextfreebin: 29432961Sdp78419 if (plw_initialized == 0) { 29442961Sdp78419 page_list_walk_init(szc, 0, bin, 1, 0, &plw); 29452961Sdp78419 ASSERT(plw.plw_ceq_dif == page_colors); 29462961Sdp78419 plw_initialized = 1; 29472961Sdp78419 } 29480Sstevel@tonic-gate 29492961Sdp78419 if (plw.plw_do_split) { 29502961Sdp78419 pp = page_freelist_split(szc, bin, mnode, 29512961Sdp78419 mtype, 29527656SSherry.Moore@Sun.COM mmu_btop(dma_attr->dma_attr_addr_lo), 29532961Sdp78419 mmu_btop(dma_attr->dma_attr_addr_hi + 1), 29542961Sdp78419 &plw); 29557656SSherry.Moore@Sun.COM if (pp != NULL) { 29567656SSherry.Moore@Sun.COM check_dma(dma_attr, pp, 1); 29572961Sdp78419 return (pp); 29587656SSherry.Moore@Sun.COM } 29592961Sdp78419 } 29602961Sdp78419 29612961Sdp78419 bin = page_list_walk_next_bin(szc, bin, &plw); 29620Sstevel@tonic-gate } 29632961Sdp78419 2964414Skchow MTYPE_NEXT(mnode, mtype, flags); 2965414Skchow } while (mtype >= 0); 29660Sstevel@tonic-gate 29670Sstevel@tonic-gate /* failed to find a page in the freelist; try it in the cachelist */ 29680Sstevel@tonic-gate 29690Sstevel@tonic-gate /* reset mtype start for cachelist search */ 29700Sstevel@tonic-gate mtype = mtypestart; 29710Sstevel@tonic-gate ASSERT(mtype >= 0); 29720Sstevel@tonic-gate 29730Sstevel@tonic-gate /* start with the bin of matching color */ 29740Sstevel@tonic-gate bin = origbin; 29750Sstevel@tonic-gate 29760Sstevel@tonic-gate do { 29770Sstevel@tonic-gate for (i = 0; i <= page_colors; i++) { 29780Sstevel@tonic-gate if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL) 29790Sstevel@tonic-gate goto nextcachebin; 29800Sstevel@tonic-gate pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 29810Sstevel@tonic-gate mutex_enter(pcm); 29820Sstevel@tonic-gate pp = PAGE_CACHELISTS(mnode, bin, mtype); 29830Sstevel@tonic-gate first_pp = pp; 29840Sstevel@tonic-gate while (pp != NULL) { 29850Sstevel@tonic-gate if (page_trylock(pp, SE_EXCL) == 0) { 29860Sstevel@tonic-gate pp = pp->p_next; 29870Sstevel@tonic-gate if (pp == first_pp) 2988*9244SSherry.Moore@Sun.COM pp = NULL; 29890Sstevel@tonic-gate continue; 29900Sstevel@tonic-gate } 29910Sstevel@tonic-gate ASSERT(pp->p_vnode); 29920Sstevel@tonic-gate ASSERT(PP_ISAGED(pp) == 0); 
29930Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 29940Sstevel@tonic-gate ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 29950Sstevel@tonic-gate 29960Sstevel@tonic-gate /* check if page within DMA attributes */ 29970Sstevel@tonic-gate 29983446Smrj pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 29990Sstevel@tonic-gate if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 30000Sstevel@tonic-gate (pgaddr + MMU_PAGESIZE - 1 <= 30010Sstevel@tonic-gate dma_attr->dma_attr_addr_hi)) { 30020Sstevel@tonic-gate break; 30030Sstevel@tonic-gate } 30040Sstevel@tonic-gate 30050Sstevel@tonic-gate /* continue looking */ 30060Sstevel@tonic-gate page_unlock(pp); 30070Sstevel@tonic-gate pp = pp->p_next; 30080Sstevel@tonic-gate if (pp == first_pp) 30090Sstevel@tonic-gate pp = NULL; 30100Sstevel@tonic-gate } 30110Sstevel@tonic-gate 30120Sstevel@tonic-gate if (pp != NULL) { 30130Sstevel@tonic-gate ASSERT(mtype == PP_2_MTYPE(pp)); 30140Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 30150Sstevel@tonic-gate 30160Sstevel@tonic-gate /* found a page with specified DMA attributes */ 30170Sstevel@tonic-gate page_sub(&PAGE_CACHELISTS(mnode, bin, 30180Sstevel@tonic-gate mtype), pp); 3019414Skchow page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 30200Sstevel@tonic-gate 30210Sstevel@tonic-gate mutex_exit(pcm); 30220Sstevel@tonic-gate ASSERT(pp->p_vnode); 30230Sstevel@tonic-gate ASSERT(PP_ISAGED(pp) == 0); 30240Sstevel@tonic-gate check_dma(dma_attr, pp, 1); 30250Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocok); 30260Sstevel@tonic-gate return (pp); 30270Sstevel@tonic-gate } 30280Sstevel@tonic-gate mutex_exit(pcm); 30290Sstevel@tonic-gate nextcachebin: 30300Sstevel@tonic-gate bin += (i == 0) ? BIN_STEP : 1; 30310Sstevel@tonic-gate bin &= page_colors_mask; 30320Sstevel@tonic-gate } 3033414Skchow MTYPE_NEXT(mnode, mtype, flags); 3034414Skchow } while (mtype >= 0); 30350Sstevel@tonic-gate 30360Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocfailed); 30370Sstevel@tonic-gate return (NULL); 30380Sstevel@tonic-gate } 30390Sstevel@tonic-gate 30400Sstevel@tonic-gate /* 30410Sstevel@tonic-gate * This function is similar to page_get_freelist()/page_get_cachelist() 30420Sstevel@tonic-gate * but it searches both the lists to find a page with the specified 30430Sstevel@tonic-gate * color (or no color) and DMA attributes. The search is done in the 30440Sstevel@tonic-gate * freelist first and then in the cache list within the highest memory 30450Sstevel@tonic-gate * range (based on DMA attributes) before searching in the lower 30460Sstevel@tonic-gate * memory ranges. 30470Sstevel@tonic-gate * 30480Sstevel@tonic-gate * Note: This function is called only by page_create_io(). 
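 * For illustration (hypothetical attribute values): a request with
 * dma_attr_addr_lo == 0 and dma_attr_addr_hi spanning all of memory maps
 * to the full mtype range, and the walk below starts at the highest mtype
 * in that range, falling back to lower ranges only when the higher ones
 * have nothing suitable.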
30490Sstevel@tonic-gate */
30500Sstevel@tonic-gate /*ARGSUSED*/
30515084Sjohnlev static page_t *
30520Sstevel@tonic-gate page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
30530Sstevel@tonic-gate size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
30540Sstevel@tonic-gate {
30550Sstevel@tonic-gate uint_t bin;
30560Sstevel@tonic-gate int mtype;
30570Sstevel@tonic-gate page_t *pp;
30580Sstevel@tonic-gate int n;
30590Sstevel@tonic-gate int m;
30600Sstevel@tonic-gate int szc;
30610Sstevel@tonic-gate int fullrange;
30620Sstevel@tonic-gate int mnode;
30630Sstevel@tonic-gate int local_failed_stat = 0;
30640Sstevel@tonic-gate lgrp_mnode_cookie_t lgrp_cookie;
30650Sstevel@tonic-gate
30660Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_alloc);
30670Sstevel@tonic-gate
30680Sstevel@tonic-gate /* only base pagesize currently supported */
30690Sstevel@tonic-gate if (size != MMU_PAGESIZE)
30700Sstevel@tonic-gate return (NULL);
30710Sstevel@tonic-gate
30720Sstevel@tonic-gate /*
30730Sstevel@tonic-gate * If we're passed a specific lgroup, we use it. Otherwise,
30740Sstevel@tonic-gate * assume first-touch placement is desired.
30750Sstevel@tonic-gate */
30760Sstevel@tonic-gate if (!LGRP_EXISTS(lgrp))
30770Sstevel@tonic-gate lgrp = lgrp_home_lgrp();
30780Sstevel@tonic-gate
30790Sstevel@tonic-gate /* LINTED */
30802961Sdp78419 AS_2_BIN(as, seg, vp, vaddr, bin, 0);
30810Sstevel@tonic-gate
30820Sstevel@tonic-gate /*
30830Sstevel@tonic-gate * Only hold one freelist or cachelist lock at a time; that way we
30840Sstevel@tonic-gate * can start anywhere and not have to worry about lock
30850Sstevel@tonic-gate * ordering.
30860Sstevel@tonic-gate */
30870Sstevel@tonic-gate if (dma_attr == NULL) {
30880Sstevel@tonic-gate n = 0;
30890Sstevel@tonic-gate m = mnoderangecnt - 1;
30900Sstevel@tonic-gate fullrange = 1;
30910Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
30920Sstevel@tonic-gate } else {
30930Sstevel@tonic-gate pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
30940Sstevel@tonic-gate pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
30950Sstevel@tonic-gate
30960Sstevel@tonic-gate /*
30970Sstevel@tonic-gate * We can guarantee alignment only to a page boundary.
30980Sstevel@tonic-gate */
30990Sstevel@tonic-gate if (dma_attr->dma_attr_align > MMU_PAGESIZE)
31000Sstevel@tonic-gate return (NULL);
31010Sstevel@tonic-gate
31020Sstevel@tonic-gate n = pfn_2_mtype(pfnlo);
31030Sstevel@tonic-gate m = pfn_2_mtype(pfnhi);
31040Sstevel@tonic-gate
31050Sstevel@tonic-gate fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
31060Sstevel@tonic-gate (pfnhi >= mnoderanges[m].mnr_pfnhi));
31070Sstevel@tonic-gate }
31080Sstevel@tonic-gate VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
31090Sstevel@tonic-gate
31100Sstevel@tonic-gate if (n > m)
31110Sstevel@tonic-gate return (NULL);
31120Sstevel@tonic-gate
31130Sstevel@tonic-gate szc = 0;
31140Sstevel@tonic-gate
31150Sstevel@tonic-gate /* cycling through mtype handled by RANGE0 if n == 0 */
31160Sstevel@tonic-gate if (n == 0) {
31170Sstevel@tonic-gate flags |= PGI_MT_RANGE0;
31180Sstevel@tonic-gate n = m;
31190Sstevel@tonic-gate }
31200Sstevel@tonic-gate
31210Sstevel@tonic-gate /*
31220Sstevel@tonic-gate * Try local memory node first, but try remote if we can't
31230Sstevel@tonic-gate * get a page of the right color.
31240Sstevel@tonic-gate */
31250Sstevel@tonic-gate LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
31260Sstevel@tonic-gate while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
31270Sstevel@tonic-gate /*
31280Sstevel@tonic-gate * allocate pages from high pfn to low.
31290Sstevel@tonic-gate */
31300Sstevel@tonic-gate for (mtype = m; mtype >= n; mtype--) {
31310Sstevel@tonic-gate if (fullrange != 0) {
31320Sstevel@tonic-gate pp = page_get_mnode_freelist(mnode,
31330Sstevel@tonic-gate bin, mtype, szc, flags);
31340Sstevel@tonic-gate if (pp == NULL) {
31350Sstevel@tonic-gate pp = page_get_mnode_cachelist(
31365084Sjohnlev bin, flags, mnode, mtype);
31370Sstevel@tonic-gate }
31380Sstevel@tonic-gate } else {
31390Sstevel@tonic-gate pp = page_get_mnode_anylist(bin, szc,
31400Sstevel@tonic-gate flags, mnode, mtype, dma_attr);
31410Sstevel@tonic-gate }
31420Sstevel@tonic-gate if (pp != NULL) {
31430Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_allocok);
31440Sstevel@tonic-gate check_dma(dma_attr, pp, 1);
31450Sstevel@tonic-gate return (pp);
31460Sstevel@tonic-gate }
31470Sstevel@tonic-gate }
31480Sstevel@tonic-gate if (!local_failed_stat) {
31490Sstevel@tonic-gate lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
31500Sstevel@tonic-gate local_failed_stat = 1;
31510Sstevel@tonic-gate }
31520Sstevel@tonic-gate }
31530Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_allocfailed);
31540Sstevel@tonic-gate
31550Sstevel@tonic-gate return (NULL);
31560Sstevel@tonic-gate }
31570Sstevel@tonic-gate
31580Sstevel@tonic-gate /*
31590Sstevel@tonic-gate * page_create_io()
31600Sstevel@tonic-gate *
31610Sstevel@tonic-gate * This function is a copy of page_create_va() with an additional
31620Sstevel@tonic-gate * argument 'mattr' that specifies DMA memory requirements to
31630Sstevel@tonic-gate * the page list functions. This function is used by the segkmem
31640Sstevel@tonic-gate * allocator, so it is only used to create new pages (i.e., PG_EXCL is
31650Sstevel@tonic-gate * set).
31660Sstevel@tonic-gate *
31670Sstevel@tonic-gate * Note: This interface is currently used only by the x86 PSM and is
31680Sstevel@tonic-gate * not fully specified, so the commitment level is that of a
31690Sstevel@tonic-gate * private interface specific to x86. This interface uses the PSM-
31700Sstevel@tonic-gate * specific page_get_anylist() interface.
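 * If PG_PHYSCONTIG is passed, the contiguous allocator is tried first and
 * any remainder (once the request fits within dma_attr_sgllen segments)
 * is filled from page_get_anylist().  A hypothetical usage sketch appears
 * at the end of this file.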
31710Sstevel@tonic-gate */ 31720Sstevel@tonic-gate 31730Sstevel@tonic-gate #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 31740Sstevel@tonic-gate for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 31750Sstevel@tonic-gate if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 31760Sstevel@tonic-gate break; \ 31770Sstevel@tonic-gate } \ 31780Sstevel@tonic-gate } 31790Sstevel@tonic-gate 31800Sstevel@tonic-gate 31810Sstevel@tonic-gate page_t * 31820Sstevel@tonic-gate page_create_io( 31830Sstevel@tonic-gate struct vnode *vp, 31840Sstevel@tonic-gate u_offset_t off, 31850Sstevel@tonic-gate uint_t bytes, 31860Sstevel@tonic-gate uint_t flags, 31870Sstevel@tonic-gate struct as *as, 31880Sstevel@tonic-gate caddr_t vaddr, 31890Sstevel@tonic-gate ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 31900Sstevel@tonic-gate { 31910Sstevel@tonic-gate page_t *plist = NULL; 31920Sstevel@tonic-gate uint_t plist_len = 0; 31930Sstevel@tonic-gate pgcnt_t npages; 31940Sstevel@tonic-gate page_t *npp = NULL; 31950Sstevel@tonic-gate uint_t pages_req; 31960Sstevel@tonic-gate page_t *pp; 31970Sstevel@tonic-gate kmutex_t *phm = NULL; 31980Sstevel@tonic-gate uint_t index; 31990Sstevel@tonic-gate 32000Sstevel@tonic-gate TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 32015084Sjohnlev "page_create_start:vp %p off %llx bytes %u flags %x", 32025084Sjohnlev vp, off, bytes, flags); 32030Sstevel@tonic-gate 32040Sstevel@tonic-gate ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 32050Sstevel@tonic-gate 32060Sstevel@tonic-gate pages_req = npages = mmu_btopr(bytes); 32070Sstevel@tonic-gate 32080Sstevel@tonic-gate /* 32090Sstevel@tonic-gate * Do the freemem and pcf accounting. 32100Sstevel@tonic-gate */ 32110Sstevel@tonic-gate if (!page_create_wait(npages, flags)) { 32120Sstevel@tonic-gate return (NULL); 32130Sstevel@tonic-gate } 32140Sstevel@tonic-gate 32150Sstevel@tonic-gate TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 32165084Sjohnlev "page_create_success:vp %p off %llx", vp, off); 32170Sstevel@tonic-gate 32180Sstevel@tonic-gate /* 32190Sstevel@tonic-gate * If satisfying this request has left us with too little 32200Sstevel@tonic-gate * memory, start the wheels turning to get some back. The 32210Sstevel@tonic-gate * first clause of the test prevents waking up the pageout 32220Sstevel@tonic-gate * daemon in situations where it would decide that there's 32230Sstevel@tonic-gate * nothing to do. 
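 * In other words, the wakeup is only sent when freemem has fallen below
 * minfree and the scanner has not yet covered desscan pages in the
 * current pass.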
32240Sstevel@tonic-gate */ 32250Sstevel@tonic-gate if (nscan < desscan && freemem < minfree) { 32260Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 32275084Sjohnlev "pageout_cv_signal:freemem %ld", freemem); 32280Sstevel@tonic-gate cv_signal(&proc_pageout->p_cv); 32290Sstevel@tonic-gate } 32300Sstevel@tonic-gate 32310Sstevel@tonic-gate if (flags & PG_PHYSCONTIG) { 32320Sstevel@tonic-gate 32330Sstevel@tonic-gate plist = page_get_contigpage(&npages, mattr, 1); 32340Sstevel@tonic-gate if (plist == NULL) { 32350Sstevel@tonic-gate page_create_putback(npages); 32360Sstevel@tonic-gate return (NULL); 32370Sstevel@tonic-gate } 32380Sstevel@tonic-gate 32390Sstevel@tonic-gate pp = plist; 32400Sstevel@tonic-gate 32410Sstevel@tonic-gate do { 32420Sstevel@tonic-gate if (!page_hashin(pp, vp, off, NULL)) { 32430Sstevel@tonic-gate panic("pg_creat_io: hashin failed %p %p %llx", 32440Sstevel@tonic-gate (void *)pp, (void *)vp, off); 32450Sstevel@tonic-gate } 32460Sstevel@tonic-gate VM_STAT_ADD(page_create_new); 32470Sstevel@tonic-gate off += MMU_PAGESIZE; 32480Sstevel@tonic-gate PP_CLRFREE(pp); 32490Sstevel@tonic-gate PP_CLRAGED(pp); 32500Sstevel@tonic-gate page_set_props(pp, P_REF); 32510Sstevel@tonic-gate pp = pp->p_next; 32520Sstevel@tonic-gate } while (pp != plist); 32530Sstevel@tonic-gate 32540Sstevel@tonic-gate if (!npages) { 32550Sstevel@tonic-gate check_dma(mattr, plist, pages_req); 32560Sstevel@tonic-gate return (plist); 32570Sstevel@tonic-gate } else { 32580Sstevel@tonic-gate vaddr += (pages_req - npages) << MMU_PAGESHIFT; 32590Sstevel@tonic-gate } 32600Sstevel@tonic-gate 32610Sstevel@tonic-gate /* 32620Sstevel@tonic-gate * fall-thru: 32630Sstevel@tonic-gate * 32640Sstevel@tonic-gate * page_get_contigpage returns when npages <= sgllen. 32650Sstevel@tonic-gate * Grab the rest of the non-contig pages below from anylist. 32660Sstevel@tonic-gate */ 32670Sstevel@tonic-gate } 32680Sstevel@tonic-gate 32690Sstevel@tonic-gate /* 32700Sstevel@tonic-gate * Loop around collecting the requested number of pages. 32710Sstevel@tonic-gate * Most of the time, we have to `create' a new page. With 32720Sstevel@tonic-gate * this in mind, pull the page off the free list before 32730Sstevel@tonic-gate * getting the hash lock. This will minimize the hash 32740Sstevel@tonic-gate * lock hold time, nesting, and the like. If it turns 32750Sstevel@tonic-gate * out we don't need the page, we put it back at the end. 32760Sstevel@tonic-gate */ 32770Sstevel@tonic-gate while (npages--) { 32780Sstevel@tonic-gate phm = NULL; 32790Sstevel@tonic-gate 32800Sstevel@tonic-gate index = PAGE_HASH_FUNC(vp, off); 32810Sstevel@tonic-gate top: 32820Sstevel@tonic-gate ASSERT(phm == NULL); 32830Sstevel@tonic-gate ASSERT(index == PAGE_HASH_FUNC(vp, off)); 32840Sstevel@tonic-gate ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 32850Sstevel@tonic-gate 32860Sstevel@tonic-gate if (npp == NULL) { 32870Sstevel@tonic-gate /* 32880Sstevel@tonic-gate * Try to get the page of any color either from 32890Sstevel@tonic-gate * the freelist or from the cache list. 32900Sstevel@tonic-gate */ 32910Sstevel@tonic-gate npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, 32920Sstevel@tonic-gate flags & ~PG_MATCH_COLOR, mattr, NULL); 32930Sstevel@tonic-gate if (npp == NULL) { 32940Sstevel@tonic-gate if (mattr == NULL) { 32950Sstevel@tonic-gate /* 32960Sstevel@tonic-gate * Not looking for a special page; 32970Sstevel@tonic-gate * panic! 
32980Sstevel@tonic-gate */
32990Sstevel@tonic-gate panic("no page found %d", (int)npages);
33000Sstevel@tonic-gate }
33010Sstevel@tonic-gate /*
33020Sstevel@tonic-gate * No page found! This can happen
33030Sstevel@tonic-gate * if we are looking for a page
33040Sstevel@tonic-gate * within a specific memory range
33050Sstevel@tonic-gate * for DMA purposes. If PG_WAIT is
33060Sstevel@tonic-gate * specified then we wait for a
33070Sstevel@tonic-gate * while and then try again. The
33080Sstevel@tonic-gate * wait could be forever if we
33090Sstevel@tonic-gate * don't get the page(s) we need.
33100Sstevel@tonic-gate *
33110Sstevel@tonic-gate * Note: XXX We really need a mechanism
33120Sstevel@tonic-gate * to wait for pages in the desired
33130Sstevel@tonic-gate * range. For now, we wait for any
33140Sstevel@tonic-gate * pages and see if we can use them.
33150Sstevel@tonic-gate */
33160Sstevel@tonic-gate
33170Sstevel@tonic-gate if ((mattr != NULL) && (flags & PG_WAIT)) {
33180Sstevel@tonic-gate delay(10);
33190Sstevel@tonic-gate goto top;
33200Sstevel@tonic-gate }
33210Sstevel@tonic-gate goto fail; /* undo accounting stuff */
33220Sstevel@tonic-gate }
33230Sstevel@tonic-gate
33240Sstevel@tonic-gate if (PP_ISAGED(npp) == 0) {
33250Sstevel@tonic-gate /*
33260Sstevel@tonic-gate * Since this page came from the
33270Sstevel@tonic-gate * cachelist, we must destroy the
33280Sstevel@tonic-gate * old vnode association.
33290Sstevel@tonic-gate */
33300Sstevel@tonic-gate page_hashout(npp, (kmutex_t *)NULL);
33310Sstevel@tonic-gate }
33320Sstevel@tonic-gate }
33330Sstevel@tonic-gate
33340Sstevel@tonic-gate /*
33350Sstevel@tonic-gate * We own this page!
33360Sstevel@tonic-gate */
33370Sstevel@tonic-gate ASSERT(PAGE_EXCL(npp));
33380Sstevel@tonic-gate ASSERT(npp->p_vnode == NULL);
33390Sstevel@tonic-gate ASSERT(!hat_page_is_mapped(npp));
33400Sstevel@tonic-gate PP_CLRFREE(npp);
33410Sstevel@tonic-gate PP_CLRAGED(npp);
33420Sstevel@tonic-gate
33430Sstevel@tonic-gate /*
33440Sstevel@tonic-gate * Here we have a page in our hot little mitts and are
33450Sstevel@tonic-gate * just waiting to stuff it on the appropriate lists.
33460Sstevel@tonic-gate * Get the mutex and check to see if it really does
33470Sstevel@tonic-gate * not exist.
33480Sstevel@tonic-gate */
33490Sstevel@tonic-gate phm = PAGE_HASH_MUTEX(index);
33500Sstevel@tonic-gate mutex_enter(phm);
33510Sstevel@tonic-gate PAGE_HASH_SEARCH(index, pp, vp, off);
33520Sstevel@tonic-gate if (pp == NULL) {
33530Sstevel@tonic-gate VM_STAT_ADD(page_create_new);
33540Sstevel@tonic-gate pp = npp;
33550Sstevel@tonic-gate npp = NULL;
33560Sstevel@tonic-gate if (!page_hashin(pp, vp, off, phm)) {
33570Sstevel@tonic-gate /*
33580Sstevel@tonic-gate * Since we hold the page hash mutex and
33590Sstevel@tonic-gate * just searched for this page, page_hashin
33600Sstevel@tonic-gate * had better not fail. If it does, that
33610Sstevel@tonic-gate * means some thread did not follow the
33620Sstevel@tonic-gate * page hash mutex rules. Panic now and
33630Sstevel@tonic-gate * get it over with. As usual, go down
33640Sstevel@tonic-gate * holding all the locks.
33650Sstevel@tonic-gate */ 33660Sstevel@tonic-gate ASSERT(MUTEX_HELD(phm)); 33670Sstevel@tonic-gate panic("page_create: hashin fail %p %p %llx %p", 33680Sstevel@tonic-gate (void *)pp, (void *)vp, off, (void *)phm); 33690Sstevel@tonic-gate 33700Sstevel@tonic-gate } 33710Sstevel@tonic-gate ASSERT(MUTEX_HELD(phm)); 33720Sstevel@tonic-gate mutex_exit(phm); 33730Sstevel@tonic-gate phm = NULL; 33740Sstevel@tonic-gate 33750Sstevel@tonic-gate /* 33760Sstevel@tonic-gate * Hat layer locking need not be done to set 33770Sstevel@tonic-gate * the following bits since the page is not hashed 33780Sstevel@tonic-gate * and was on the free list (i.e., had no mappings). 33790Sstevel@tonic-gate * 33800Sstevel@tonic-gate * Set the reference bit to protect 33810Sstevel@tonic-gate * against immediate pageout 33820Sstevel@tonic-gate * 33830Sstevel@tonic-gate * XXXmh modify freelist code to set reference 33840Sstevel@tonic-gate * bit so we don't have to do it here. 33850Sstevel@tonic-gate */ 33860Sstevel@tonic-gate page_set_props(pp, P_REF); 33870Sstevel@tonic-gate } else { 33880Sstevel@tonic-gate ASSERT(MUTEX_HELD(phm)); 33890Sstevel@tonic-gate mutex_exit(phm); 33900Sstevel@tonic-gate phm = NULL; 33910Sstevel@tonic-gate /* 33920Sstevel@tonic-gate * NOTE: This should not happen for pages associated 33930Sstevel@tonic-gate * with kernel vnode 'kvp'. 33940Sstevel@tonic-gate */ 33950Sstevel@tonic-gate /* XX64 - to debug why this happens! */ 33963290Sjohansen ASSERT(!VN_ISKAS(vp)); 33973290Sjohansen if (VN_ISKAS(vp)) 33980Sstevel@tonic-gate cmn_err(CE_NOTE, 33990Sstevel@tonic-gate "page_create: page not expected " 34000Sstevel@tonic-gate "in hash list for kernel vnode - pp 0x%p", 34010Sstevel@tonic-gate (void *)pp); 34020Sstevel@tonic-gate VM_STAT_ADD(page_create_exists); 34030Sstevel@tonic-gate goto fail; 34040Sstevel@tonic-gate } 34050Sstevel@tonic-gate 34060Sstevel@tonic-gate /* 34070Sstevel@tonic-gate * Got a page! It is locked. Acquire the i/o 34080Sstevel@tonic-gate * lock since we are going to use the p_next and 34090Sstevel@tonic-gate * p_prev fields to link the requested pages together. 34100Sstevel@tonic-gate */ 34110Sstevel@tonic-gate page_io_lock(pp); 34120Sstevel@tonic-gate page_add(&plist, pp); 34130Sstevel@tonic-gate plist = plist->p_next; 34140Sstevel@tonic-gate off += MMU_PAGESIZE; 34150Sstevel@tonic-gate vaddr += MMU_PAGESIZE; 34160Sstevel@tonic-gate } 34170Sstevel@tonic-gate 34180Sstevel@tonic-gate check_dma(mattr, plist, pages_req); 34190Sstevel@tonic-gate return (plist); 34200Sstevel@tonic-gate 34210Sstevel@tonic-gate fail: 34220Sstevel@tonic-gate if (npp != NULL) { 34230Sstevel@tonic-gate /* 34240Sstevel@tonic-gate * Did not need this page after all. 34250Sstevel@tonic-gate * Put it back on the free list. 34260Sstevel@tonic-gate */ 34270Sstevel@tonic-gate VM_STAT_ADD(page_create_putbacks); 34280Sstevel@tonic-gate PP_SETFREE(npp); 34290Sstevel@tonic-gate PP_SETAGED(npp); 34300Sstevel@tonic-gate npp->p_offset = (u_offset_t)-1; 34310Sstevel@tonic-gate page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 34320Sstevel@tonic-gate page_unlock(npp); 34330Sstevel@tonic-gate } 34340Sstevel@tonic-gate 34350Sstevel@tonic-gate /* 34360Sstevel@tonic-gate * Give up the pages we already got. 
34370Sstevel@tonic-gate */ 34380Sstevel@tonic-gate while (plist != NULL) { 34390Sstevel@tonic-gate pp = plist; 34400Sstevel@tonic-gate page_sub(&plist, pp); 34410Sstevel@tonic-gate page_io_unlock(pp); 34420Sstevel@tonic-gate plist_len++; 34430Sstevel@tonic-gate /*LINTED: constant in conditional ctx*/ 34440Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 34450Sstevel@tonic-gate } 34460Sstevel@tonic-gate 34470Sstevel@tonic-gate /* 34480Sstevel@tonic-gate * VN_DISPOSE does freemem accounting for the pages in plist 34490Sstevel@tonic-gate * by calling page_free. So, we need to undo the pcf accounting 34500Sstevel@tonic-gate * for only the remaining pages. 34510Sstevel@tonic-gate */ 34520Sstevel@tonic-gate VM_STAT_ADD(page_create_putbacks); 34530Sstevel@tonic-gate page_create_putback(pages_req - plist_len); 34540Sstevel@tonic-gate 34550Sstevel@tonic-gate return (NULL); 34560Sstevel@tonic-gate } 34575084Sjohnlev #endif /* !__xpv */ 34580Sstevel@tonic-gate 34590Sstevel@tonic-gate 34600Sstevel@tonic-gate /* 34610Sstevel@tonic-gate * Copy the data from the physical page represented by "frompp" to 34620Sstevel@tonic-gate * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 34630Sstevel@tonic-gate * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt 34640Sstevel@tonic-gate * level and no one sleeps with an active mapping there. 34650Sstevel@tonic-gate * 34660Sstevel@tonic-gate * Note that the ref/mod bits in the page_t's are not affected by 34670Sstevel@tonic-gate * this operation, hence it is up to the caller to update them appropriately. 34680Sstevel@tonic-gate */ 34693253Smec int 34700Sstevel@tonic-gate ppcopy(page_t *frompp, page_t *topp) 34710Sstevel@tonic-gate { 34720Sstevel@tonic-gate caddr_t pp_addr1; 34730Sstevel@tonic-gate caddr_t pp_addr2; 34743446Smrj hat_mempte_t pte1; 34753446Smrj hat_mempte_t pte2; 34760Sstevel@tonic-gate kmutex_t *ppaddr_mutex; 34773253Smec label_t ljb; 34783253Smec int ret = 1; 34790Sstevel@tonic-gate 34800Sstevel@tonic-gate ASSERT_STACK_ALIGNED(); 34810Sstevel@tonic-gate ASSERT(PAGE_LOCKED(frompp)); 34820Sstevel@tonic-gate ASSERT(PAGE_LOCKED(topp)); 34830Sstevel@tonic-gate 34840Sstevel@tonic-gate if (kpm_enable) { 34850Sstevel@tonic-gate pp_addr1 = hat_kpm_page2va(frompp, 0); 34860Sstevel@tonic-gate pp_addr2 = hat_kpm_page2va(topp, 0); 34870Sstevel@tonic-gate kpreempt_disable(); 34880Sstevel@tonic-gate } else { 34890Sstevel@tonic-gate /* 34900Sstevel@tonic-gate * disable pre-emption so that CPU can't change 34910Sstevel@tonic-gate */ 34920Sstevel@tonic-gate kpreempt_disable(); 34930Sstevel@tonic-gate 34940Sstevel@tonic-gate pp_addr1 = CPU->cpu_caddr1; 34950Sstevel@tonic-gate pp_addr2 = CPU->cpu_caddr2; 34963446Smrj pte1 = CPU->cpu_caddr1pte; 34973446Smrj pte2 = CPU->cpu_caddr2pte; 34980Sstevel@tonic-gate 34990Sstevel@tonic-gate ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 35000Sstevel@tonic-gate mutex_enter(ppaddr_mutex); 35010Sstevel@tonic-gate 35020Sstevel@tonic-gate hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1, 35030Sstevel@tonic-gate PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST); 35040Sstevel@tonic-gate hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2, 35050Sstevel@tonic-gate PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 35060Sstevel@tonic-gate HAT_LOAD_NOCONSIST); 35070Sstevel@tonic-gate } 35080Sstevel@tonic-gate 35093253Smec if (on_fault(&ljb)) { 35103253Smec ret = 0; 35113253Smec goto faulted; 35123253Smec } 35130Sstevel@tonic-gate if (use_sse_pagecopy) 35145084Sjohnlev #ifdef __xpv 35155084Sjohnlev 
page_copy_no_xmm(pp_addr2, pp_addr1); 35165084Sjohnlev #else 35170Sstevel@tonic-gate hwblkpagecopy(pp_addr1, pp_addr2); 35185084Sjohnlev #endif 35190Sstevel@tonic-gate else 35200Sstevel@tonic-gate bcopy(pp_addr1, pp_addr2, PAGESIZE); 35210Sstevel@tonic-gate 35223253Smec no_fault(); 35233253Smec faulted: 35243446Smrj if (!kpm_enable) { 35255084Sjohnlev #ifdef __xpv 35265084Sjohnlev /* 35275217Sjosephb * We can't leave unused mappings laying about under the 35285217Sjosephb * hypervisor, so blow them away. 35295084Sjohnlev */ 35305217Sjosephb if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0, 35315217Sjosephb UVMF_INVLPG | UVMF_LOCAL) < 0) 35325217Sjosephb panic("HYPERVISOR_update_va_mapping() failed"); 35335084Sjohnlev if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 35345084Sjohnlev UVMF_INVLPG | UVMF_LOCAL) < 0) 35355084Sjohnlev panic("HYPERVISOR_update_va_mapping() failed"); 35365084Sjohnlev #endif 35370Sstevel@tonic-gate mutex_exit(ppaddr_mutex); 35383446Smrj } 35390Sstevel@tonic-gate kpreempt_enable(); 35403253Smec return (ret); 35410Sstevel@tonic-gate } 35420Sstevel@tonic-gate 35435262Srscott void 35445262Srscott pagezero(page_t *pp, uint_t off, uint_t len) 35455262Srscott { 35465262Srscott ASSERT(PAGE_LOCKED(pp)); 35475262Srscott pfnzero(page_pptonum(pp), off, len); 35485262Srscott } 35495262Srscott 35500Sstevel@tonic-gate /* 35515262Srscott * Zero the physical page from off to off + len given by pfn 35520Sstevel@tonic-gate * without changing the reference and modified bits of page. 35530Sstevel@tonic-gate * 35540Sstevel@tonic-gate * We use this using CPU private page address #2, see ppcopy() for more info. 35555262Srscott * pfnzero() must not be called at interrupt level. 35560Sstevel@tonic-gate */ 35570Sstevel@tonic-gate void 35585262Srscott pfnzero(pfn_t pfn, uint_t off, uint_t len) 35590Sstevel@tonic-gate { 35600Sstevel@tonic-gate caddr_t pp_addr2; 35613446Smrj hat_mempte_t pte2; 35625262Srscott kmutex_t *ppaddr_mutex = NULL; 35630Sstevel@tonic-gate 35640Sstevel@tonic-gate ASSERT_STACK_ALIGNED(); 35650Sstevel@tonic-gate ASSERT(len <= MMU_PAGESIZE); 35660Sstevel@tonic-gate ASSERT(off <= MMU_PAGESIZE); 35670Sstevel@tonic-gate ASSERT(off + len <= MMU_PAGESIZE); 35685262Srscott 35695262Srscott if (kpm_enable && !pfn_is_foreign(pfn)) { 35705262Srscott pp_addr2 = hat_kpm_pfn2va(pfn); 35710Sstevel@tonic-gate kpreempt_disable(); 35720Sstevel@tonic-gate } else { 35730Sstevel@tonic-gate kpreempt_disable(); 35740Sstevel@tonic-gate 35750Sstevel@tonic-gate pp_addr2 = CPU->cpu_caddr2; 35763446Smrj pte2 = CPU->cpu_caddr2pte; 35770Sstevel@tonic-gate 35780Sstevel@tonic-gate ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 35790Sstevel@tonic-gate mutex_enter(ppaddr_mutex); 35800Sstevel@tonic-gate 35815262Srscott hat_mempte_remap(pfn, pp_addr2, pte2, 35820Sstevel@tonic-gate PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 35830Sstevel@tonic-gate HAT_LOAD_NOCONSIST); 35840Sstevel@tonic-gate } 35850Sstevel@tonic-gate 35863446Smrj if (use_sse_pagezero) { 35875084Sjohnlev #ifdef __xpv 35885084Sjohnlev uint_t rem; 35895084Sjohnlev 35905084Sjohnlev /* 35915084Sjohnlev * zero a byte at a time until properly aligned for 35925084Sjohnlev * block_zero_no_xmm(). 35935084Sjohnlev */ 35945084Sjohnlev while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0) 35955084Sjohnlev pp_addr2[off++] = 0; 35965084Sjohnlev 35975084Sjohnlev /* 35985084Sjohnlev * Now use faster block_zero_no_xmm() for any range 35995084Sjohnlev * that is properly aligned and sized. 
36005084Sjohnlev */ 36015084Sjohnlev rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN)); 36025084Sjohnlev len -= rem; 36035084Sjohnlev if (len != 0) { 36045084Sjohnlev block_zero_no_xmm(pp_addr2 + off, len); 36055084Sjohnlev off += len; 36065084Sjohnlev } 36075084Sjohnlev 36085084Sjohnlev /* 36095084Sjohnlev * zero remainder with byte stores. 36105084Sjohnlev */ 36115084Sjohnlev while (rem-- > 0) 36125084Sjohnlev pp_addr2[off++] = 0; 36135084Sjohnlev #else 36140Sstevel@tonic-gate hwblkclr(pp_addr2 + off, len); 36155084Sjohnlev #endif 36163446Smrj } else { 36170Sstevel@tonic-gate bzero(pp_addr2 + off, len); 36183446Smrj } 36190Sstevel@tonic-gate 36205262Srscott if (!kpm_enable || pfn_is_foreign(pfn)) { 36215084Sjohnlev #ifdef __xpv 36225262Srscott /* 36235262Srscott * On the hypervisor this page might get used for a page 36245262Srscott * table before any intervening change to this mapping, 36255262Srscott * so blow it away. 36265262Srscott */ 36275262Srscott if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 36285262Srscott UVMF_INVLPG) < 0) 36295262Srscott panic("HYPERVISOR_update_va_mapping() failed"); 36305084Sjohnlev #endif 36310Sstevel@tonic-gate mutex_exit(ppaddr_mutex); 36325262Srscott } 36335262Srscott 36340Sstevel@tonic-gate kpreempt_enable(); 36350Sstevel@tonic-gate } 36360Sstevel@tonic-gate 36370Sstevel@tonic-gate /* 36380Sstevel@tonic-gate * Platform-dependent page scrub call. 36390Sstevel@tonic-gate */ 36400Sstevel@tonic-gate void 36410Sstevel@tonic-gate pagescrub(page_t *pp, uint_t off, uint_t len) 36420Sstevel@tonic-gate { 36430Sstevel@tonic-gate /* 36440Sstevel@tonic-gate * For now, we rely on the fact that pagezero() will 36450Sstevel@tonic-gate * always clear UEs. 36460Sstevel@tonic-gate */ 36470Sstevel@tonic-gate pagezero(pp, off, len); 36480Sstevel@tonic-gate } 36490Sstevel@tonic-gate 36500Sstevel@tonic-gate /* 36510Sstevel@tonic-gate * set up two private addresses for use on a given CPU for use in ppcopy() 36520Sstevel@tonic-gate */ 36530Sstevel@tonic-gate void 36540Sstevel@tonic-gate setup_vaddr_for_ppcopy(struct cpu *cpup) 36550Sstevel@tonic-gate { 36560Sstevel@tonic-gate void *addr; 36573446Smrj hat_mempte_t pte_pa; 36580Sstevel@tonic-gate 36590Sstevel@tonic-gate addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 36603446Smrj pte_pa = hat_mempte_setup(addr); 36610Sstevel@tonic-gate cpup->cpu_caddr1 = addr; 36623446Smrj cpup->cpu_caddr1pte = pte_pa; 36630Sstevel@tonic-gate 36640Sstevel@tonic-gate addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 36653446Smrj pte_pa = hat_mempte_setup(addr); 36660Sstevel@tonic-gate cpup->cpu_caddr2 = addr; 36673446Smrj cpup->cpu_caddr2pte = pte_pa; 36680Sstevel@tonic-gate 36690Sstevel@tonic-gate mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 36700Sstevel@tonic-gate } 36710Sstevel@tonic-gate 36723446Smrj /* 36733446Smrj * Undo setup_vaddr_for_ppcopy 36743446Smrj */ 36753446Smrj void 36763446Smrj teardown_vaddr_for_ppcopy(struct cpu *cpup) 36773446Smrj { 36783446Smrj mutex_destroy(&cpup->cpu_ppaddr_mutex); 36793446Smrj 36803446Smrj hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte); 36813446Smrj cpup->cpu_caddr2pte = 0; 36823446Smrj vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1)); 36833446Smrj cpup->cpu_caddr2 = 0; 36843446Smrj 36853446Smrj hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte); 36863446Smrj cpup->cpu_caddr1pte = 0; 36873446Smrj vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1)); 36883446Smrj cpup->cpu_caddr1 = 0; 36893446Smrj } 36900Sstevel@tonic-gate 36910Sstevel@tonic-gate /* 
36920Sstevel@tonic-gate * Create the pageout scanner thread. The thread starts executing
36930Sstevel@tonic-gate * at 'procedure', in process pp, at priority pri.
36940Sstevel@tonic-gate */
36950Sstevel@tonic-gate void
36960Sstevel@tonic-gate pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
36970Sstevel@tonic-gate {
36980Sstevel@tonic-gate (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
36990Sstevel@tonic-gate }
37000Sstevel@tonic-gate
37010Sstevel@tonic-gate /*
37020Sstevel@tonic-gate * Function for flushing D-cache when performing module relocations
37030Sstevel@tonic-gate * to an alternate mapping. Unnecessary on Intel / AMD platforms.
37040Sstevel@tonic-gate */
37050Sstevel@tonic-gate void
37060Sstevel@tonic-gate dcache_flushall()
37070Sstevel@tonic-gate {}
37083177Sdp78419
37093177Sdp78419 size_t
37103177Sdp78419 exec_get_spslew(void)
37113177Sdp78419 {
37123177Sdp78419 return (0);
37133177Sdp78419 }
37143446Smrj
37153446Smrj /*
37163446Smrj * Allocate a memory page. The argument 'seed' can be any pseudo-random
37173446Smrj * number to vary where the pages come from. This is quite a hacked up
37183446Smrj * method -- it works for now, but really needs to be fixed up a bit.
37193446Smrj *
37203446Smrj * We currently use page_create_va() on the kvp with fake offsets,
37213446Smrj * segments and virt address. This is pretty bogus, but was copied from the
37223446Smrj * old hat_i86.c code. A better approach would be to specify either mnode
37233446Smrj * random or mnode local and take a page from whatever color has the MOST
37243446Smrj * available - this would have a minimal impact on page coloring.
37253446Smrj */
37263446Smrj page_t *
37279062SVikram.Hegde@Sun.COM page_get_physical(uintptr_t seed)
37283446Smrj {
37293446Smrj page_t *pp;
37309062SVikram.Hegde@Sun.COM u_offset_t offset;
37313446Smrj static struct seg tmpseg;
37323446Smrj static uintptr_t ctr = 0;
37333446Smrj
37343446Smrj /*
37353446Smrj * This code is gross; we really need a simpler page allocator.
37363446Smrj *
37379062SVikram.Hegde@Sun.COM * We need to assign an offset for the page in order to call page_create_va().
37383446Smrj * To avoid conflicts with other pages, we get creative with the offset.
37397589SVikram.Hegde@Sun.COM * For 32 bits, we need an offset > 4Gig.
37407589SVikram.Hegde@Sun.COM * For 64 bits, we need an offset somewhere in the VA hole.
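 * (Worked example with made-up numbers: MMU_PAGESHIFT is 12 here, so a
 * seed of 0x1234 that is below kernelbase becomes byte offset 0x1234000
 * before the above-4G / VA-hole bias below is added.)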
37413446Smrj */ 37429062SVikram.Hegde@Sun.COM offset = seed; 37439062SVikram.Hegde@Sun.COM if (offset > kernelbase) 37449062SVikram.Hegde@Sun.COM offset -= kernelbase; 37459062SVikram.Hegde@Sun.COM offset <<= MMU_PAGESHIFT; 37469062SVikram.Hegde@Sun.COM #if defined(__amd64) 37479062SVikram.Hegde@Sun.COM offset += mmu.hole_start; /* something in VA hole */ 37489062SVikram.Hegde@Sun.COM #else 37499062SVikram.Hegde@Sun.COM offset += 1ULL << 40; /* something > 4 Gig */ 37509062SVikram.Hegde@Sun.COM #endif 37519062SVikram.Hegde@Sun.COM 37529062SVikram.Hegde@Sun.COM if (page_resv(1, KM_NOSLEEP) == 0) 37533446Smrj return (NULL); 37543446Smrj 37553446Smrj #ifdef DEBUG 37563446Smrj pp = page_exists(&kvp, offset); 37573446Smrj if (pp != NULL) 37587240Srh87107 panic("page already exists %p", (void *)pp); 37593446Smrj #endif 37603446Smrj 37615084Sjohnlev pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL, 37623446Smrj &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE)); /* changing VA usage */ 37637589SVikram.Hegde@Sun.COM if (pp != NULL) { 37647589SVikram.Hegde@Sun.COM page_io_unlock(pp); 37657589SVikram.Hegde@Sun.COM page_hashout(pp, NULL); 37668198SVikram.Hegde@Sun.COM page_downgrade(pp); 37677589SVikram.Hegde@Sun.COM } 37683446Smrj return (pp); 37693446Smrj } 3770
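/*
 * Illustrative sketch only -- not part of the original file.  It shows how
 * a caller might build a ddi_dma_attr_t and request physically contiguous,
 * address-limited pages through page_create_io() above.  The helper name
 * and the attribute values (a hypothetical 16MB ceiling) are made up for
 * the example, and only the fields that this file appears to consult are
 * filled in.
 */
static page_t *
example_create_low_pages(struct vnode *vp, u_offset_t off, uint_t bytes,
    caddr_t vaddr)
{
	ddi_dma_attr_t attr;

	bzero(&attr, sizeof (attr));
	attr.dma_attr_addr_lo = 0;		/* hypothetical: pages below 16MB */
	attr.dma_attr_addr_hi = 0xFFFFFFULL;
	attr.dma_attr_align = MMU_PAGESIZE;	/* page alignment is all we can ask for */
	attr.dma_attr_minxfer = 1;
	attr.dma_attr_sgllen = 1;		/* a single segment, so keep it contiguous */

	/*
	 * PG_EXCL is required by page_create_io(); PG_PHYSCONTIG asks it to
	 * try the contiguous allocator before filling any remainder from
	 * page_get_anylist().
	 */
	return (page_create_io(vp, off, bytes, PG_EXCL | PG_PHYSCONTIG,
	    &kas, vaddr, &attr));
}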