/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>
#include <util/qsort.h>
#include <sys/taskq.h>

#ifdef __xpv

#include <sys/hypervisor.h>
#include <sys/xen_mmu.h>
#include <sys/balloon_impl.h>

/*
 * domain 0 pages usable for DMA are pre-allocated and kept in
 * distinct lists, ordered by increasing mfn.
 */
static kmutex_t io_pool_lock;
static page_t *io_pool_4g;	/* pool for 32 bit dma limited devices */
static page_t *io_pool_16m;	/* pool for 24 bit dma limited legacy devices */
static long io_pool_cnt;
static long io_pool_cnt_max = 0;
#define	DEFAULT_IO_POOL_MIN	128
static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
static long io_pool_cnt_lowater = 0;
static long io_pool_shrink_attempts; /* how many times did we try to shrink */
static long io_pool_shrinks;	/* how many times did we really shrink */
static long io_pool_grows;	/* how many times did we grow */
static mfn_t start_mfn = 1;
static caddr_t io_pool_kva;	/* used to alloc pages when needed */

static int create_contig_pfnlist(uint_t);

/*
 * percentage of phys mem to hold in the i/o pool
 */
#define	DEFAULT_IO_POOL_PCT	2
static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
static void page_io_pool_sub(page_t **, page_t *, page_t *);

#endif /* __xpv */

uint_t vac_colors = 1;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
/*
 * Allow users to disable the kernel's use of SSE.
 */
extern int use_sse_pagecopy, use_sse_pagezero;

/*
 * combined memory ranges from mnode and memranges[] to manage single
 * mnode/mtype dimension in the page lists.
 */
typedef struct {
        pfn_t   mnr_pfnlo;
        pfn_t   mnr_pfnhi;
        int     mnr_mnode;
        int     mnr_memrange;           /* index into memranges[] */
        /* maintain page list stats */
        pgcnt_t mnr_mt_clpgcnt;         /* cache list cnt */
        pgcnt_t mnr_mt_flpgcnt;         /* free list cnt - small pages */
        pgcnt_t mnr_mt_lgpgcnt;         /* free list cnt - large pages */
#ifdef DEBUG
        struct mnr_mts {                /* mnode/mtype szc stats */
                pgcnt_t mnr_mts_pgcnt;
                int     mnr_mts_colors;
                pgcnt_t *mnr_mtsc_pgcnt;
        } *mnr_mts;
#endif
} mnoderange_t;

#define	MEMRANGEHI(mtype)						\
	((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
#define	MEMRANGELO(mtype)	(memranges[mtype])

#define	MTYPE_FREEMEM(mt)						\
	(mnoderanges[mt].mnr_mt_clpgcnt +				\
	mnoderanges[mt].mnr_mt_flpgcnt +				\
	mnoderanges[mt].mnr_mt_lgpgcnt)
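
/*
 * Illustrative example of the macros above (values taken from the
 * arch_memranges[] table defined below): memrange index 1 covers pfns
 * 0x80000 through 0xfffff (2G-4G), since MEMRANGELO(1) is memranges[1]
 * and MEMRANGEHI(1) is memranges[0] - 1; index 0 covers 0x100000 up to
 * physmax.  MTYPE_FREEMEM(mt) is simply the sum of the cache list,
 * small page free list and large page free list counts kept in the
 * mnoderange.
 */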
/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 *
 * These are listed in reverse order, so that we can skip over unused
 * ranges on machines with small memories.
 *
 * For now under the Hypervisor, we'll only ever have one memrange.
 */
#define	PFN_4GIG	0x100000
#define	PFN_16MEG	0x1000
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
        PFN_4GIG,       /* pfn range for 4G and above */
        0x80000,        /* pfn range for 2G-4G */
        PFN_16MEG,      /* pfn range for 16M-2G */
        0x00000,        /* pfn range for 0-16M */
};
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * This combines mem_node_config and memranges into one data
 * structure to be used for page list management.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

/*
 * 4g memory management variables for systems with more than 4g of memory:
 *
 * physical memory below 4g is required for 32bit dma devices and, currently,
 * for kmem memory. On systems with more than 4g of memory, the pool of memory
 * below 4g can be depleted without any paging activity given that there is
 * likely to be sufficient memory above 4g.
 *
 * physmax4g is set true if the largest pfn is over 4g. The rest of the
 * 4g memory management code is enabled only when physmax4g is true.
 *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 * agp aperture etc.
 *
 * freemem4g maintains the count of the number of available pages on the
 * page lists with physical addresses below 4g.
 *
 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 * 6% (desfree4gshift = 4) of maxmem4g.
 *
 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 * and the amount of physical memory above 4g is greater than freemem4g.
 * In this case, page_get_* routines will restrict below 4g allocations
 * for requests that don't specifically require it.
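 *
 * For example (illustrative numbers only): with the default desfree4gshift
 * of 4, DESFREE4G is maxmem4g >> 4, i.e. one sixteenth (roughly 6%) of
 * maxmem4g, and LOTSFREE4G (lotsfree4gshift of 3) is maxmem4g >> 3.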
 */

#define	LOTSFREE4G	(maxmem4g >> lotsfree4gshift)
#define	DESFREE4G	(maxmem4g >> desfree4gshift)

#define	RESTRICT4G_ALLOC					\
	(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))

static pgcnt_t	maxmem4g;
static pgcnt_t	freemem4g;
static int	physmax4g;
static int	desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
static int	lotsfree4gshift = 3;

/*
 * 16m memory management:
 *
 * reserve some amount of physical memory below 16m for legacy devices.
 *
 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 * 16m or if the 16m pool drops below DESFREE16M.
 *
 * In this case, general page allocations via page_get_{free,cache}list
 * routines will be restricted from allocating from the 16m pool. Allocations
 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 * are not restricted.
 */

#define	FREEMEM16M	MTYPE_FREEMEM(0)
#define	DESFREE16M	desfree16m
#define	RESTRICT16M_ALLOC(freemem, pgcnt, flags)		\
	((freemem != 0) && ((flags & PG_PANIC) == 0) &&		\
	    ((freemem >= (FREEMEM16M)) ||			\
	    (FREEMEM16M < (DESFREE16M + pgcnt))))

static pgcnt_t	desfree16m = 0x380;

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
int restricted_kmemalloc = 0;

#ifdef VM_STATS
struct {
        ulong_t pga_alloc;
        ulong_t pga_notfullrange;
        ulong_t pga_nulldmaattr;
        ulong_t pga_allocok;
        ulong_t pga_allocfailed;
        ulong_t pgma_alloc;
        ulong_t pgma_allocok;
        ulong_t pgma_allocfailed;
        ulong_t pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Maximum and default segment size tunables for user private
 * and shared anon memory, and user text and initialized data.
 * These can be patched via /etc/system to allow large pages
 * to be used for mapping application private and shared anon memory.
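 * For example (illustrative only): "set max_shm_lpsize=0x200000" in
 * /etc/system would allow page sizes up to 2M to be considered for
 * shared anon mappings.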
 */
size_t mcntl0_lpsize = MMU_PAGESIZE;
size_t max_uheap_lpsize = MMU_PAGESIZE;
size_t default_uheap_lpsize = MMU_PAGESIZE;
size_t max_ustack_lpsize = MMU_PAGESIZE;
size_t default_ustack_lpsize = MMU_PAGESIZE;
size_t max_privmap_lpsize = MMU_PAGESIZE;
size_t max_uidata_lpsize = MMU_PAGESIZE;
size_t max_utext_lpsize = MMU_PAGESIZE;
size_t max_shm_lpsize = MMU_PAGESIZE;


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;


/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];

/*
 * Only let one thread at a time try to coalesce large pages, to
 * prevent them from working against each other.
 */
static kmutex_t	contig_lock;
#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
        level_t l = 0;
        size_t pgsz = MMU_PAGESIZE;
        size_t max_lpsize;
        uint_t mszc;

        ASSERT(maptype != MAPPGSZ_VA);

        if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
                return (MMU_PAGESIZE);
        }

        switch (maptype) {
        case MAPPGSZ_HEAP:
        case MAPPGSZ_STK:
                max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
                    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
                if (max_lpsize == MMU_PAGESIZE) {
                        return (MMU_PAGESIZE);
                }
                if (len == 0) {
                        len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
                            p->p_brksize - p->p_bssbase : p->p_stksize;
                }
                len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
                    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);

                /*
                 * use the page size that best fits len
                 */
                for (l = mmu.max_page_level; l > 0; --l) {
                        if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
                                continue;
                        } else {
                                pgsz = LEVEL_SIZE(l);
                        }
                        break;
                }

                mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
                    p->p_stkpageszc);
                if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
                        pgsz = hw_page_array[mszc].hp_size;
                }
                return (pgsz);

        /*
         * for ISM use the 1st large page size.
         */
        case MAPPGSZ_ISM:
                if (mmu.max_page_level == 0)
                        return (MMU_PAGESIZE);
                return (LEVEL_SIZE(1));
        }
        return (pgsz);
}

static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
    size_t min_physmem)
{
        caddr_t eaddr = addr + size;
        uint_t szcvec = 0;
        caddr_t raddr;
        caddr_t readdr;
        size_t pgsz;
        int i;

        if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
                return (0);
        }

        for (i = mmu_page_sizes - 1; i > 0; i--) {
                pgsz = page_get_pagesize(i);
                if (pgsz > max_lpsize) {
                        continue;
                }
                raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
                readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
                if (raddr < addr || raddr >= readdr) {
                        continue;
                }
                if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
                        continue;
                }
                /*
                 * Set szcvec to the remaining page sizes.
                 */
                szcvec = ((1 << (i + 1)) - 1) & ~1;
                break;
        }
        return (szcvec);
}

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
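 *
 * Bit n of the returned vector corresponds to page size code n; bit 0
 * (the base page size) is always clear.  For example (illustrative), if
 * the largest size code that fits is 2, map_szcvec() above returns
 * ((1 << 3) - 1) & ~1 == 0x6, i.e. size codes 1 and 2.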
 */
/*ARGSUSED*/
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
        size_t max_lpsize = mcntl0_lpsize;

        if (mmu.max_page_level == 0)
                return (0);

        if (flags & MAP_TEXT) {
                if (!memcntl)
                        max_lpsize = max_utext_lpsize;
                return (map_szcvec(addr, size, off, max_lpsize,
                    shm_lpg_min_physmem));

        } else if (flags & MAP_INITDATA) {
                if (!memcntl)
                        max_lpsize = max_uidata_lpsize;
                return (map_szcvec(addr, size, off, max_lpsize,
                    privm_lpg_min_physmem));

        } else if (type == MAPPGSZC_SHM) {
                if (!memcntl)
                        max_lpsize = max_shm_lpsize;
                return (map_szcvec(addr, size, off, max_lpsize,
                    shm_lpg_min_physmem));

        } else if (type == MAPPGSZC_HEAP) {
                if (!memcntl)
                        max_lpsize = max_uheap_lpsize;
                return (map_szcvec(addr, size, off, max_lpsize,
                    privm_lpg_min_physmem));

        } else if (type == MAPPGSZC_STACK) {
                if (!memcntl)
                        max_lpsize = max_ustack_lpsize;
                return (map_szcvec(addr, size, off, max_lpsize,
                    privm_lpg_min_physmem));

        } else {
                if (!memcntl)
                        max_lpsize = max_privmap_lpsize;
                return (map_szcvec(addr, size, off, max_lpsize,
                    privm_lpg_min_physmem));
        }
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
        caddr_t addr,
        enum fault_type type,
        enum seg_rw rw,
        int iskernel)
{
        struct as *as;
        struct hat *hat;
        struct proc *p;
        kthread_t *t;
        faultcode_t res;
        caddr_t base;
        size_t len;
        int err;
        int mapped_red;
        uintptr_t ea;

        ASSERT_STACK_ALIGNED();

        if (INVALID_VADDR(addr))
                return (FC_NOMAP);

        mapped_red = segkp_map_red();

        if (iskernel) {
                as = &kas;
                hat = as->a_hat;
        } else {
                t = curthread;
                p = ttoproc(t);
                as = p->p_as;
                hat = as->a_hat;
        }

        /*
         * Dispatch pagefault.
         */
        res = as_fault(hat, as, addr, 1, type, rw);

        /*
         * If this isn't a potential unmapped hole in the user's
         * UNIX data or stack segments, just return status info.
         */
        if (res != FC_NOMAP || iskernel)
                goto out;

        /*
         * Check to see if we happened to fault on a currently unmapped
         * part of the UNIX data or stack segments.  If so, create a zfod
         * mapping there and then try calling the fault routine again.
         */
        base = p->p_brkbase;
        len = p->p_brksize;

        if (addr < base || addr >= base + len) {        /* data seg? */
                base = (caddr_t)p->p_usrstack - p->p_stksize;
                len = p->p_stksize;
                if (addr < base || addr >= p->p_usrstack) {     /* stack seg? */
                        /* not in either UNIX data or stack segments */
                        res = FC_NOMAP;
                        goto out;
                }
        }

        /*
         * the rest of this function implements 3.X, 4.X and 5.X compatibility.
         * This code is probably not needed anymore.
         */
        if (p->p_model == DATAMODEL_ILP32) {

                /* expand the gap to the page boundaries on each side */
                ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
                base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
                len = ea - (uintptr_t)base;

                as_rangelock(as);
                if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
                    0) {
                        err = as_map(as, base, len, segvn_create, zfod_argsp);
                        as_rangeunlock(as);
                        if (err) {
                                res = FC_MAKE_ERR(err);
                                goto out;
                        }
                } else {
                        /*
                         * This page is already mapped by another thread after
                         * we returned from as_fault() above.  We just fall
                         * through as_fault() below.
                         */
                        as_rangeunlock(as);
                }

                res = as_fault(hat, as, addr, 1, F_INVAL, rw);
        }

out:
        if (mapped_red)
                segkp_unmap_red();

        return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
        struct proc *p = curproc;
        caddr_t userlimit = (flags & _MAP_LOW32) ?
            (caddr_t)_userlimit32 : p->p_as->a_userlimit;

        map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
        return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below userlimit.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
        caddr_t *addrp,
        size_t len,
        offset_t off,
        int vacalign,
        caddr_t userlimit,
        struct proc *p,
        uint_t flags)
{
        struct as *as = p->p_as;
        caddr_t addr;
        caddr_t base;
        size_t slen;
        size_t align_amount;

        ASSERT32(userlimit == as->a_userlimit);

        base = p->p_brkbase;
#if defined(__amd64)
        /*
         * XX64 Yes, this needs more work.
         */
        if (p->p_model == DATAMODEL_NATIVE) {
                if (userlimit < as->a_userlimit) {
                        /*
                         * This happens when a program wants to map
                         * something in a range that's accessible to a
                         * program in a smaller address space.  For example,
                         * a 64-bit program calling mmap32(2) to guarantee
                         * that the returned address is below 4Gbytes.
                         */
                        ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

                        if (userlimit > base)
                                slen = userlimit - base;
                        else {
                                *addrp = NULL;
                                return;
                        }
                } else {
                        /*
                         * XX64 This layout is probably wrong .. but in
                         * the event we make the amd64 address space look
                         * like sparcv9 i.e. with the stack -above- the
                         * heap, this bit of code might even be correct.
                         */
                        slen = p->p_usrstack - base -
                            (((size_t)rctl_enforced_value(
                            rctlproc_legacy[RLIMIT_STACK],
                            p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
                }
        } else
#endif
                slen = userlimit - base;

        len = (len + PAGEOFFSET) & PAGEMASK;

        /*
         * Redzone for each side of the request. This is done to leave
         * one page unmapped between segments. This is not required, but
         * it's useful for the user because if their program strays across
         * a segment boundary, it will catch a fault immediately making
         * debugging a little easier.
         */
        len += 2 * MMU_PAGESIZE;

        /*
         * figure out what the alignment should be
         *
         * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
         */
        if (len <= ELF_386_MAXPGSZ) {
                /*
                 * Align virtual addresses to ensure that ELF shared libraries
                 * are mapped with the appropriate alignment constraints by
                 * the run-time linker.
                 */
                align_amount = ELF_386_MAXPGSZ;
        } else {
                int l = mmu.max_page_level;

                while (l && len < LEVEL_SIZE(l))
                        --l;

                align_amount = LEVEL_SIZE(l);
        }

        if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
                align_amount = (uintptr_t)*addrp;

        len += align_amount;

        /*
         * Look for a large enough hole starting below userlimit.
         * After finding it, use the upper part.  Addition of PAGESIZE
         * is for the redzone as described above.
         */
        if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
                caddr_t as_addr;

                addr = base + slen - len + MMU_PAGESIZE;
                as_addr = addr;
                /*
                 * Round address DOWN to the alignment amount,
                 * add the offset, and if this address is less
                 * than the original address, add alignment amount.
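                 *
                 * For example (illustrative numbers): with align_amount
                 * 0x10000 and an offset phase of 0x3000, an addr of 0x25000
                 * rounds down to 0x20000, becomes 0x23000 after adding the
                 * phase, and is then bumped to 0x33000 because 0x23000 is
                 * below the original address.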
                 */
                addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
                addr += (uintptr_t)(off & (align_amount - 1));
                if (addr < as_addr)
                        addr += align_amount;

                ASSERT(addr <= (as_addr + align_amount));
                ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
                    ((uintptr_t)(off & (align_amount - 1))));
                *addrp = addr;
        } else {
                *addrp = NULL;  /* no more virtual space */
        }
}

/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
        uintptr_t hi, lo;

        lo = (uintptr_t)*basep;
        hi = lo + *lenp;

        /*
         * If hi rolled over the top, try cutting back.
         */
        if (hi < lo) {
                if (0 - lo + hi < minlen)
                        return (0);
                if (0 - lo < minlen)
                        return (0);
                *lenp = 0 - lo;
        } else if (hi - lo < minlen) {
                return (0);
        }
#if defined(__amd64)
        /*
         * Deal with a possible hole in the address range between
         * hole_start and hole_end that should never be mapped.
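         * (On amd64, hole_start and hole_end bound the architectural
         * non-canonical address hole.)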
         */
        if (lo < hole_start) {
                if (hi > hole_start) {
                        if (hi < hole_end) {
                                hi = hole_start;
                        } else {
                                /* lo < hole_start && hi >= hole_end */
                                if (dir == AH_LO) {
                                        /*
                                         * prefer lowest range
                                         */
                                        if (hole_start - lo >= minlen)
                                                hi = hole_start;
                                        else if (hi - hole_end >= minlen)
                                                lo = hole_end;
                                        else
                                                return (0);
                                } else {
                                        /*
                                         * prefer highest range
                                         */
                                        if (hi - hole_end >= minlen)
                                                lo = hole_end;
                                        else if (hole_start - lo >= minlen)
                                                hi = hole_start;
                                        else
                                                return (0);
                                }
                        }
                }
        } else {
                /* lo >= hole_start */
                if (hi < hole_end)
                        return (0);
                if (lo < hole_end)
                        lo = hole_end;
        }

        if (hi - lo < minlen)
                return (0);

        *basep = (caddr_t)lo;
        *lenp = hi - lo;
#endif
        return (1);
}

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
        caddr_t eaddr = addr + len;

        if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
                return (RANGE_BADADDR);

#if defined(__amd64)
        /*
         * Check for the VA hole
         */
        if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
                return (RANGE_BADADDR);
#endif

        return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
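 * Page frames that belong to a foreign domain (pfn_is_foreign()) are
 * never treated as onboard memory.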
 */
int
pf_is_memory(pfn_t pf)
{
        if (pfn_is_foreign(pf))
                return (0);
        return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
}

/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
        int n;

        for (n = 0; n < nranges - 1; ++n) {
                if (pfn >= memranges[n])
                        break;
        }
        return (n);
}

/*
 * return the mnoderange containing pfn
 */
/*ARGSUSED*/
int
pfn_2_mtype(pfn_t pfn)
{
#if defined(__xpv)
        return (0);
#else
        int     n;

        for (n = mnoderangecnt - 1; n >= 0; n--) {
                if (pfn >= mnoderanges[n].mnr_pfnlo) {
                        break;
                }
        }
        return (n);
#endif
}

#if !defined(__xpv)
/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
        pfn_t *pfnp,
        pgcnt_t *pgcnt,
        pgcnt_t minctg,
        uint64_t pfnseg,
        int iolock)
{
        int     i = 0;
        pfn_t   pfn = *pfnp;
        page_t  *pp;
        page_t  *plist = NULL;

        /*
         * fail if pfn + minctg crosses a segment boundary.
         * Adjust for next starting pfn to begin at segment boundary.
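         *
         * For example (illustrative): with pfnseg 0xf and *pfnp 0xe, a
         * minctg of 4 gives (0xe + 3) & 0xf == 0x1, which is less than
         * 0xe & 0xf, so the run would cross a segment boundary and the
         * next search starts at the boundary (0x10).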
         */

        if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
                *pfnp = roundup(*pfnp, pfnseg + 1);
                return (NULL);
        }

        do {
retry:
                pp = page_numtopp_nolock(pfn + i);
                if ((pp == NULL) ||
                    (page_trylock(pp, SE_EXCL) == 0)) {
                        (*pfnp)++;
                        break;
                }
                if (page_pptonum(pp) != pfn + i) {
                        page_unlock(pp);
                        goto retry;
                }

                if (!(PP_ISFREE(pp))) {
                        page_unlock(pp);
                        (*pfnp)++;
                        break;
                }

                if (!PP_ISAGED(pp)) {
                        page_list_sub(pp, PG_CACHE_LIST);
                        page_hashout(pp, (kmutex_t *)NULL);
                } else {
                        page_list_sub(pp, PG_FREE_LIST);
                }

                if (iolock)
                        page_io_lock(pp);
                page_list_concat(&plist, &pp);

                /*
                 * exit loop when pgcnt satisfied or segment boundary reached.
                 */

        } while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

        *pfnp += i;             /* set to next pfn to search */

        if (i >= minctg) {
                *pgcnt -= i;
                return (plist);
        }

        /*
         * failure: minctg not satisfied.
         *
         * if next request crosses segment boundary, set next pfn
         * to search from the segment boundary.
         */
        if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
                *pfnp = roundup(*pfnp, pfnseg + 1);

        /* clean up any pages already allocated */

        while (plist) {
                pp = plist;
                page_sub(&plist, pp);
                page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
                if (iolock)
                        page_io_unlock(pp);
                page_unlock(pp);
        }

        return (NULL);
}
#endif  /* !__xpv */

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
        if (dma_attr == NULL)
                return;

        while (cnt-- > 0) {
                if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
                    dma_attr->dma_attr_addr_lo)
                        panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
                if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
                    dma_attr->dma_attr_addr_hi)
                        panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
                pp = pp->p_next;
        }
}
#endif

#if !defined(__xpv)
static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
        pfn_t           pfn;
        int             sgllen;
        uint64_t        pfnseg;
        pgcnt_t         minctg;
        page_t          *pplist = NULL, *plist;
        uint64_t        lo, hi;
        pgcnt_t         pfnalign = 0;
        static pfn_t    startpfn;
        static pgcnt_t  lastctgcnt;
        uintptr_t       align;

        CONTIG_LOCK();

        if (mattr) {
                lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
                hi = mmu_btop(mattr->dma_attr_addr_hi);
                if (hi >= physmax)
                        hi = physmax - 1;
                sgllen = mattr->dma_attr_sgllen;
                pfnseg = mmu_btop(mattr->dma_attr_seg);

                align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
                if (align > MMU_PAGESIZE)
                        pfnalign = mmu_btop(align);

                /*
                 * in order to satisfy the request, must minimally
                 * acquire minctg contiguous pages
                 */
                minctg = howmany(*pgcnt, sgllen);

                ASSERT(hi >= lo);

                /*
                 * start from where last searched if the minctg >= lastctgcnt
                 */
                if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
                        startpfn = lo;
        } else {
                hi = physmax - 1;
                lo = 0;
                sgllen = 1;
                pfnseg = mmu.highest_pfn;
                minctg = *pgcnt;

                if (minctg < lastctgcnt)
                        startpfn = lo;
        }
        lastctgcnt = minctg;

        ASSERT(pfnseg + 1 >= (uint64_t)minctg);

        /* conserve 16m memory - start search above 16m when possible */
        if (hi > PFN_16M && startpfn < PFN_16M)
                startpfn = PFN_16M;

        pfn = startpfn;
        if (pfnalign)
                pfn = P2ROUNDUP(pfn, pfnalign);

        while (pfn + minctg - 1 <= hi) {

                plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
                if (plist) {
                        page_list_concat(&pplist, &plist);
                        sgllen--;
                        /*
                         * return when contig pages no longer needed
                         */
                        if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
                                startpfn = pfn;
                                CONTIG_UNLOCK();
                                check_dma(mattr, pplist, *pgcnt);
                                return (pplist);
                        }
                        minctg = howmany(*pgcnt, sgllen);
                }
                if (pfnalign)
                        pfn = P2ROUNDUP(pfn, pfnalign);
        }

        /* cannot find contig pages in specified range */
        if (startpfn == lo) {
                CONTIG_UNLOCK();
                return (NULL);
        }

        /* did not start with lo previously */
        pfn = lo;
        if (pfnalign)
                pfn = P2ROUNDUP(pfn, pfnalign);

        /* allow search to go above startpfn */
        while (pfn < startpfn) {

                plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
                if (plist != NULL) {

                        page_list_concat(&pplist, &plist);
                        sgllen--;

                        /*
                         * return when contig pages no longer needed
                         */
                        if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
                                startpfn = pfn;
                                CONTIG_UNLOCK();
                                check_dma(mattr, pplist, *pgcnt);
                                return (pplist);
                        }
                        minctg = howmany(*pgcnt, sgllen);
                }
                if (pfnalign)
                        pfn = P2ROUNDUP(pfn, pfnalign);
        }
        CONTIG_UNLOCK();
        return (NULL);
}
#endif  /* !__xpv */

/*
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 */
int
mnode_range_cnt(int mnode)
{
#if defined(__xpv)
        ASSERT(mnode == 0);
        return (1);
#else   /* __xpv */
        int     mri;
        int     mnrcnt = 0;

        if (mem_node_config[mnode].exists != 0) {
                mri = nranges - 1;

                /* find the memranges index below contained in mnode range */

                while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
                        mri--;

                /*
                 * increment mnode range counter when memranges or mnode
                 * boundary is reached.
                 */
                while (mri >= 0 &&
                    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
                        mnrcnt++;
                        if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
                                mri--;
                        else
                                break;
                }
        }
        ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
        return (mnrcnt);
#endif  /* __xpv */
}

/*
 * mnode_range_setup() initializes mnoderanges.
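 * Within each mnode, entries are filled in in order of increasing pfn,
 * i.e. walking memranges[] from the low end of memory upward.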
 */
void
mnode_range_setup(mnoderange_t *mnoderanges)
{
        int     mnode, mri;

        for (mnode = 0; mnode < max_mem_nodes; mnode++) {
                if (mem_node_config[mnode].exists == 0)
                        continue;

                mri = nranges - 1;

                while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
                        mri--;

                while (mri >= 0 && mem_node_config[mnode].physmax >=
                    MEMRANGELO(mri)) {
                        mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
                            mem_node_config[mnode].physbase);
                        mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
                            mem_node_config[mnode].physmax);
                        mnoderanges->mnr_mnode = mnode;
                        mnoderanges->mnr_memrange = mri;
                        mnoderanges++;
                        if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
                                mri--;
                        else
                                break;
                }
        }
}

/*ARGSUSED*/
int
mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
{
        int mtype = mnoderangecnt - 1;

#if !defined(__xpv)
#if defined(__i386)
        /*
         * set the mtype range
         * - kmem requests need to be below 4g if restricted_kmemalloc is set.
         * - for non kmem requests, set range to above 4g if memory below 4g
         *   runs low.
         */
        if (restricted_kmemalloc && VN_ISKAS(vp) &&
            (caddr_t)(vaddr) >= kernelheap &&
            (caddr_t)(vaddr) < ekernelheap) {
                ASSERT(physmax4g);
                mtype = mtype4g;
                if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
                    btop(pgsz), *flags)) {
                        *flags |= PGI_MT_RANGE16M;
                } else {
                        VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
                        VM_STAT_COND_ADD((*flags & PG_PANIC),
                            vmm_vmstats.pgpanicalloc);
                        *flags |= PGI_MT_RANGE0;
                }
                return (mtype);
        }
#endif  /* __i386 */

        if (RESTRICT4G_ALLOC) {
                VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
                /* here only for > 4g systems */
                *flags |= PGI_MT_RANGE4G;
        } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
                *flags |= PGI_MT_RANGE16M;
        } else {
                VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
                VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
                *flags |= PGI_MT_RANGE0;
        }
#endif  /* !__xpv */
        return (mtype);
}


/* mtype init for page_get_replacement_page */
/*ARGSUSED*/
int
mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
{
        int mtype = mnoderangecnt - 1;
#if !defined(__xpv)
        if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
                *flags |= PGI_MT_RANGE16M;
        } else {
                VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
                *flags |= PGI_MT_RANGE0;
        }
#endif
        return (mtype);
}

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains
 * the range of indices from high pfn to 0, 16m or 4g.
 *
 * Return the first mnode range type index found; otherwise return -1.
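 *
 * For example (illustrative), a caller passing PGI_MT_RANGE16M stops the
 * downward search at mtlim 1, skipping the 0-16m range; PGI_MT_RANGE4G
 * stops it at mtype4g + 1, skipping everything below 4g.  PGI_MT_NEXT
 * resumes the search below the mtype passed in.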
13040Sstevel@tonic-gate */ 13050Sstevel@tonic-gate int 13060Sstevel@tonic-gate mtype_func(int mnode, int mtype, uint_t flags) 13070Sstevel@tonic-gate { 13080Sstevel@tonic-gate if (flags & PGI_MT_RANGE) { 1309*5084Sjohnlev int mtlim = 0; 13100Sstevel@tonic-gate 13110Sstevel@tonic-gate if (flags & PGI_MT_NEXT) 13120Sstevel@tonic-gate mtype--; 1313*5084Sjohnlev if (flags & PGI_MT_RANGE4G) 13141385Skchow mtlim = mtype4g + 1; /* exclude 0-4g range */ 13151385Skchow else if (flags & PGI_MT_RANGE16M) 13161385Skchow mtlim = 1; /* exclude 0-16m range */ 13170Sstevel@tonic-gate while (mtype >= mtlim) { 13180Sstevel@tonic-gate if (mnoderanges[mtype].mnr_mnode == mnode) 13190Sstevel@tonic-gate return (mtype); 13200Sstevel@tonic-gate mtype--; 13210Sstevel@tonic-gate } 1322*5084Sjohnlev } else if (mnoderanges[mtype].mnr_mnode == mnode) { 1323*5084Sjohnlev return (mtype); 13240Sstevel@tonic-gate } 13250Sstevel@tonic-gate return (-1); 13260Sstevel@tonic-gate } 13270Sstevel@tonic-gate 13280Sstevel@tonic-gate /* 13291373Skchow * Update the page list max counts with the pfn range specified by the 13301373Skchow * input parameters. Called from add_physmem() when physical memory with 13311373Skchow * page_t's are initially added to the page lists. 13321373Skchow */ 13331373Skchow void 13341373Skchow mtype_modify_max(pfn_t startpfn, long cnt) 13351373Skchow { 13361373Skchow int mtype = 0; 13371373Skchow pfn_t endpfn = startpfn + cnt, pfn; 13381373Skchow pgcnt_t inc; 13391373Skchow 13401373Skchow ASSERT(cnt > 0); 13411373Skchow 1342*5084Sjohnlev if (!physmax4g) 1343*5084Sjohnlev return; 1344*5084Sjohnlev 13451373Skchow for (pfn = startpfn; pfn < endpfn; ) { 13461373Skchow if (pfn <= mnoderanges[mtype].mnr_pfnhi) { 13471373Skchow if (endpfn < mnoderanges[mtype].mnr_pfnhi) { 13481373Skchow inc = endpfn - pfn; 13491373Skchow } else { 13501373Skchow inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1; 13511373Skchow } 1352*5084Sjohnlev if (mtype <= mtype4g) 13531373Skchow maxmem4g += inc; 13541373Skchow pfn += inc; 13551373Skchow } 13561373Skchow mtype++; 13571373Skchow ASSERT(mtype < mnoderangecnt || pfn >= endpfn); 13581373Skchow } 13591373Skchow } 13601373Skchow 1361*5084Sjohnlev int 1362*5084Sjohnlev mtype_2_mrange(int mtype) 1363*5084Sjohnlev { 1364*5084Sjohnlev return (mnoderanges[mtype].mnr_memrange); 1365*5084Sjohnlev } 1366*5084Sjohnlev 1367*5084Sjohnlev void 1368*5084Sjohnlev mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi) 1369*5084Sjohnlev { 1370*5084Sjohnlev ASSERT(mnoderanges[mtype].mnr_mnode == mnode); 1371*5084Sjohnlev *pfnlo = mnoderanges[mtype].mnr_pfnlo; 1372*5084Sjohnlev *pfnhi = mnoderanges[mtype].mnr_pfnhi; 1373*5084Sjohnlev } 1374*5084Sjohnlev 1375*5084Sjohnlev size_t 1376*5084Sjohnlev plcnt_sz(size_t ctrs_sz) 1377*5084Sjohnlev { 1378*5084Sjohnlev #ifdef DEBUG 1379*5084Sjohnlev int szc, colors; 1380*5084Sjohnlev 1381*5084Sjohnlev ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes; 1382*5084Sjohnlev for (szc = 0; szc < mmu_page_sizes; szc++) { 1383*5084Sjohnlev colors = page_get_pagecolors(szc); 1384*5084Sjohnlev ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors; 1385*5084Sjohnlev } 1386*5084Sjohnlev #endif 1387*5084Sjohnlev return (ctrs_sz); 1388*5084Sjohnlev } 1389*5084Sjohnlev 1390*5084Sjohnlev caddr_t 1391*5084Sjohnlev plcnt_init(caddr_t addr) 1392*5084Sjohnlev { 1393*5084Sjohnlev #ifdef DEBUG 1394*5084Sjohnlev int mt, szc, colors; 1395*5084Sjohnlev 1396*5084Sjohnlev for (mt = 0; mt < mnoderangecnt; mt++) { 1397*5084Sjohnlev mnoderanges[mt].mnr_mts = (struct 
mnr_mts *)addr; 1398*5084Sjohnlev addr += (sizeof (struct mnr_mts) * mmu_page_sizes); 1399*5084Sjohnlev for (szc = 0; szc < mmu_page_sizes; szc++) { 1400*5084Sjohnlev colors = page_get_pagecolors(szc); 1401*5084Sjohnlev mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors; 1402*5084Sjohnlev mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt = 1403*5084Sjohnlev (pgcnt_t *)addr; 1404*5084Sjohnlev addr += (sizeof (pgcnt_t) * colors); 1405*5084Sjohnlev } 1406*5084Sjohnlev } 1407*5084Sjohnlev #endif 1408*5084Sjohnlev return (addr); 1409*5084Sjohnlev } 1410*5084Sjohnlev 1411*5084Sjohnlev void 1412*5084Sjohnlev plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) 1413*5084Sjohnlev { 1414*5084Sjohnlev #ifdef DEBUG 1415*5084Sjohnlev int bin = PP_2_BIN(pp); 1416*5084Sjohnlev 1417*5084Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt); 1418*5084Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin], 1419*5084Sjohnlev cnt); 1420*5084Sjohnlev #endif 1421*5084Sjohnlev ASSERT(mtype == PP_2_MTYPE(pp)); 1422*5084Sjohnlev if (physmax4g && mtype <= mtype4g) 1423*5084Sjohnlev atomic_add_long(&freemem4g, cnt); 1424*5084Sjohnlev if (flags & PG_CACHE_LIST) 1425*5084Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt); 1426*5084Sjohnlev else if (szc) 1427*5084Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mt_lgpgcnt, cnt); 1428*5084Sjohnlev else 1429*5084Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt, cnt); 1430*5084Sjohnlev } 1431*5084Sjohnlev 14321373Skchow /* 1433414Skchow * Returns the free page count for mnode 1434414Skchow */ 1435414Skchow int 1436414Skchow mnode_pgcnt(int mnode) 1437414Skchow { 1438414Skchow int mtype = mnoderangecnt - 1; 1439414Skchow int flags = PGI_MT_RANGE0; 1440414Skchow pgcnt_t pgcnt = 0; 1441414Skchow 1442414Skchow mtype = mtype_func(mnode, mtype, flags); 1443414Skchow 1444414Skchow while (mtype != -1) { 14451385Skchow pgcnt += MTYPE_FREEMEM(mtype); 1446414Skchow mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1447414Skchow } 1448414Skchow return (pgcnt); 1449414Skchow } 1450414Skchow 1451414Skchow /* 14520Sstevel@tonic-gate * Initialize page coloring variables based on the l2 cache parameters. 14530Sstevel@tonic-gate * Calculate and return memory needed for page coloring data structures. 14540Sstevel@tonic-gate */ 14550Sstevel@tonic-gate size_t 14560Sstevel@tonic-gate page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) 14570Sstevel@tonic-gate { 14580Sstevel@tonic-gate size_t colorsz = 0; 14590Sstevel@tonic-gate int i; 14600Sstevel@tonic-gate int colors; 14610Sstevel@tonic-gate 1462*5084Sjohnlev #if defined(__xpv) 1463*5084Sjohnlev /* 1464*5084Sjohnlev * Hypervisor domains currently don't have any concept of NUMA. 1465*5084Sjohnlev * Hence we'll act like there is only 1 memrange. 1466*5084Sjohnlev */ 1467*5084Sjohnlev i = memrange_num(1); 1468*5084Sjohnlev #else /* !__xpv */ 14690Sstevel@tonic-gate /* 14700Sstevel@tonic-gate * Reduce the memory ranges lists if we don't have large amounts 14710Sstevel@tonic-gate * of memory. This avoids searching known empty free lists. 
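* For example, on a machine whose physmax lies below 4g, memrange_num() is
* nonzero here and the unused ranges above physmax are dropped by the
* memranges/nranges adjustment just below.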
14720Sstevel@tonic-gate */ 14730Sstevel@tonic-gate i = memrange_num(physmax); 14740Sstevel@tonic-gate #if defined(__i386) 14750Sstevel@tonic-gate if (i > 0) 14760Sstevel@tonic-gate restricted_kmemalloc = 0; 14770Sstevel@tonic-gate #endif 14780Sstevel@tonic-gate /* physmax greater than 4g */ 14790Sstevel@tonic-gate if (i == 0) 14800Sstevel@tonic-gate physmax4g = 1; 1481*5084Sjohnlev #endif /* !__xpv */ 1482*5084Sjohnlev memranges += i; 1483*5084Sjohnlev nranges -= i; 14840Sstevel@tonic-gate 14850Sstevel@tonic-gate ASSERT(ISP2(l2_sz)); 14860Sstevel@tonic-gate ASSERT(ISP2(l2_linesz)); 14870Sstevel@tonic-gate ASSERT(l2_sz > MMU_PAGESIZE); 14880Sstevel@tonic-gate 14890Sstevel@tonic-gate /* l2_assoc is 0 for fully associative l2 cache */ 14900Sstevel@tonic-gate if (l2_assoc) 14910Sstevel@tonic-gate l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE)); 14920Sstevel@tonic-gate else 14930Sstevel@tonic-gate l2_colors = 1; 14940Sstevel@tonic-gate 14950Sstevel@tonic-gate /* for scalability, configure at least PAGE_COLORS_MIN color bins */ 14960Sstevel@tonic-gate page_colors = MAX(l2_colors, PAGE_COLORS_MIN); 14970Sstevel@tonic-gate 14980Sstevel@tonic-gate /* 14990Sstevel@tonic-gate * cpu_page_colors is non-zero when a page color may be spread across 15000Sstevel@tonic-gate * multiple bins. 15010Sstevel@tonic-gate */ 15020Sstevel@tonic-gate if (l2_colors < page_colors) 15030Sstevel@tonic-gate cpu_page_colors = l2_colors; 15040Sstevel@tonic-gate 15050Sstevel@tonic-gate ASSERT(ISP2(page_colors)); 15060Sstevel@tonic-gate 15070Sstevel@tonic-gate page_colors_mask = page_colors - 1; 15080Sstevel@tonic-gate 15090Sstevel@tonic-gate ASSERT(ISP2(CPUSETSIZE())); 15100Sstevel@tonic-gate page_coloring_shift = lowbit(CPUSETSIZE()); 15110Sstevel@tonic-gate 15122961Sdp78419 /* initialize number of colors per page size */ 15132961Sdp78419 for (i = 0; i <= mmu.max_page_level; i++) { 15142961Sdp78419 hw_page_array[i].hp_size = LEVEL_SIZE(i); 15152961Sdp78419 hw_page_array[i].hp_shift = LEVEL_SHIFT(i); 15162961Sdp78419 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0); 15172961Sdp78419 hw_page_array[i].hp_colors = (page_colors_mask >> 15182961Sdp78419 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift)) 15192961Sdp78419 + 1; 15203717Sdp78419 colorequivszc[i] = 0; 15212961Sdp78419 } 15222961Sdp78419 15232961Sdp78419 /* 15242961Sdp78419 * The value of cpu_page_colors determines if additional color bins 15252961Sdp78419 * need to be checked for a particular color in the page_get routines. 15262961Sdp78419 */ 15272961Sdp78419 if (cpu_page_colors != 0) { 15282961Sdp78419 15292961Sdp78419 int a = lowbit(page_colors) - lowbit(cpu_page_colors); 15302961Sdp78419 ASSERT(a > 0); 15312961Sdp78419 ASSERT(a < 16); 15322961Sdp78419 15332961Sdp78419 for (i = 0; i <= mmu.max_page_level; i++) { 15342961Sdp78419 if ((colors = hw_page_array[i].hp_colors) <= 1) { 15352961Sdp78419 colorequivszc[i] = 0; 15362961Sdp78419 continue; 15372961Sdp78419 } 15382961Sdp78419 while ((colors >> a) == 0) 15392961Sdp78419 a--; 15402961Sdp78419 ASSERT(a >= 0); 15412961Sdp78419 15422961Sdp78419 /* higher 4 bits encodes color equiv mask */ 15432961Sdp78419 colorequivszc[i] = (a << 4); 15442961Sdp78419 } 15452961Sdp78419 } 15462961Sdp78419 1547*5084Sjohnlev /* factor in colorequiv to check additional 'equivalent' bins. 
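* For example, a hypothetical colorequiv setting of 4 yields a shift of
* lowbit(4) - 1 == 2, so for every page size with at least four colors the
* high nibble of colorequivszc[] is raised to 2 and groups of four color bins
* are treated as equivalent when the free lists are searched.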
*/ 1548*5084Sjohnlev if (colorequiv > 1) { 1549*5084Sjohnlev 1550*5084Sjohnlev int a = lowbit(colorequiv) - 1; 1551*5084Sjohnlev if (a > 15) 1552*5084Sjohnlev a = 15; 1553*5084Sjohnlev 1554*5084Sjohnlev for (i = 0; i <= mmu.max_page_level; i++) { 1555*5084Sjohnlev if ((colors = hw_page_array[i].hp_colors) <= 1) { 1556*5084Sjohnlev continue; 1557*5084Sjohnlev } 1558*5084Sjohnlev while ((colors >> a) == 0) 1559*5084Sjohnlev a--; 1560*5084Sjohnlev if ((a << 4) > colorequivszc[i]) { 1561*5084Sjohnlev colorequivszc[i] = (a << 4); 1562*5084Sjohnlev } 1563*5084Sjohnlev } 1564*5084Sjohnlev } 1565*5084Sjohnlev 15660Sstevel@tonic-gate /* size for mnoderanges */ 15672961Sdp78419 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++) 15682961Sdp78419 mnoderangecnt += mnode_range_cnt(i); 15690Sstevel@tonic-gate colorsz = mnoderangecnt * sizeof (mnoderange_t); 15700Sstevel@tonic-gate 15710Sstevel@tonic-gate /* size for fpc_mutex and cpc_mutex */ 15720Sstevel@tonic-gate colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX); 15730Sstevel@tonic-gate 15740Sstevel@tonic-gate /* size of page_freelists */ 15750Sstevel@tonic-gate colorsz += mnoderangecnt * sizeof (page_t ***); 15760Sstevel@tonic-gate colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **); 15770Sstevel@tonic-gate 15780Sstevel@tonic-gate for (i = 0; i < mmu_page_sizes; i++) { 15790Sstevel@tonic-gate colors = page_get_pagecolors(i); 15800Sstevel@tonic-gate colorsz += mnoderangecnt * colors * sizeof (page_t *); 15810Sstevel@tonic-gate } 15820Sstevel@tonic-gate 15830Sstevel@tonic-gate /* size of page_cachelists */ 15840Sstevel@tonic-gate colorsz += mnoderangecnt * sizeof (page_t **); 15850Sstevel@tonic-gate colorsz += mnoderangecnt * page_colors * sizeof (page_t *); 15860Sstevel@tonic-gate 15870Sstevel@tonic-gate return (colorsz); 15880Sstevel@tonic-gate } 15890Sstevel@tonic-gate 15900Sstevel@tonic-gate /* 15910Sstevel@tonic-gate * Called once at startup to configure page_coloring data structures and 15920Sstevel@tonic-gate * does the 1st page_free()/page_freelist_add(). 
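* The pcmemaddr buffer is expected to be at least as large as the size
* page_coloring_init() returned; mnoderanges, the fpc/cpc mutex arrays and
* the freelist/cachelist pointer arrays are simply carved out of it, in that
* order, by the code below.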
15930Sstevel@tonic-gate */ 15940Sstevel@tonic-gate void 15950Sstevel@tonic-gate page_coloring_setup(caddr_t pcmemaddr) 15960Sstevel@tonic-gate { 15970Sstevel@tonic-gate int i; 15980Sstevel@tonic-gate int j; 15990Sstevel@tonic-gate int k; 16000Sstevel@tonic-gate caddr_t addr; 16010Sstevel@tonic-gate int colors; 16020Sstevel@tonic-gate 16030Sstevel@tonic-gate /* 16040Sstevel@tonic-gate * do page coloring setup 16050Sstevel@tonic-gate */ 16060Sstevel@tonic-gate addr = pcmemaddr; 16070Sstevel@tonic-gate 16080Sstevel@tonic-gate mnoderanges = (mnoderange_t *)addr; 16090Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (mnoderange_t)); 16100Sstevel@tonic-gate 16110Sstevel@tonic-gate mnode_range_setup(mnoderanges); 16120Sstevel@tonic-gate 16130Sstevel@tonic-gate if (physmax4g) 16140Sstevel@tonic-gate mtype4g = pfn_2_mtype(0xfffff); 16150Sstevel@tonic-gate 16160Sstevel@tonic-gate for (k = 0; k < NPC_MUTEX; k++) { 16170Sstevel@tonic-gate fpc_mutex[k] = (kmutex_t *)addr; 16180Sstevel@tonic-gate addr += (max_mem_nodes * sizeof (kmutex_t)); 16190Sstevel@tonic-gate } 16200Sstevel@tonic-gate for (k = 0; k < NPC_MUTEX; k++) { 16210Sstevel@tonic-gate cpc_mutex[k] = (kmutex_t *)addr; 16220Sstevel@tonic-gate addr += (max_mem_nodes * sizeof (kmutex_t)); 16230Sstevel@tonic-gate } 16240Sstevel@tonic-gate page_freelists = (page_t ****)addr; 16250Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (page_t ***)); 16260Sstevel@tonic-gate 16270Sstevel@tonic-gate page_cachelists = (page_t ***)addr; 16280Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (page_t **)); 16290Sstevel@tonic-gate 16300Sstevel@tonic-gate for (i = 0; i < mnoderangecnt; i++) { 16310Sstevel@tonic-gate page_freelists[i] = (page_t ***)addr; 16320Sstevel@tonic-gate addr += (mmu_page_sizes * sizeof (page_t **)); 16330Sstevel@tonic-gate 16340Sstevel@tonic-gate for (j = 0; j < mmu_page_sizes; j++) { 16350Sstevel@tonic-gate colors = page_get_pagecolors(j); 16360Sstevel@tonic-gate page_freelists[i][j] = (page_t **)addr; 16370Sstevel@tonic-gate addr += (colors * sizeof (page_t *)); 16380Sstevel@tonic-gate } 16390Sstevel@tonic-gate page_cachelists[i] = (page_t **)addr; 16400Sstevel@tonic-gate addr += (page_colors * sizeof (page_t *)); 16410Sstevel@tonic-gate } 16420Sstevel@tonic-gate } 16430Sstevel@tonic-gate 1644*5084Sjohnlev #if defined(__xpv) 1645*5084Sjohnlev /* 1646*5084Sjohnlev * Give back 10% of the io_pool pages to the free list. 1647*5084Sjohnlev * Don't shrink the pool below some absolute minimum. 1648*5084Sjohnlev */ 1649*5084Sjohnlev static void 1650*5084Sjohnlev page_io_pool_shrink() 1651*5084Sjohnlev { 1652*5084Sjohnlev int retcnt; 1653*5084Sjohnlev page_t *pp, *pp_first, *pp_last, **curpool; 1654*5084Sjohnlev mfn_t mfn; 1655*5084Sjohnlev int bothpools = 0; 1656*5084Sjohnlev 1657*5084Sjohnlev mutex_enter(&io_pool_lock); 1658*5084Sjohnlev io_pool_shrink_attempts++; /* should be a kstat? */ 1659*5084Sjohnlev retcnt = io_pool_cnt / 10; 1660*5084Sjohnlev if (io_pool_cnt - retcnt < io_pool_cnt_min) 1661*5084Sjohnlev retcnt = io_pool_cnt - io_pool_cnt_min; 1662*5084Sjohnlev if (retcnt <= 0) 1663*5084Sjohnlev goto done; 1664*5084Sjohnlev io_pool_shrinks++; /* should be a kstat? */ 1665*5084Sjohnlev curpool = &io_pool_4g; 1666*5084Sjohnlev domore: 1667*5084Sjohnlev /* 1668*5084Sjohnlev * Loop through taking pages from the end of the list 1669*5084Sjohnlev * (highest mfns) till amount to return reached. 
1670*5084Sjohnlev */ 1671*5084Sjohnlev for (pp = *curpool; pp && retcnt > 0; ) { 1672*5084Sjohnlev pp_first = pp_last = pp->p_prev; 1673*5084Sjohnlev if (pp_first == *curpool) 1674*5084Sjohnlev break; 1675*5084Sjohnlev retcnt--; 1676*5084Sjohnlev io_pool_cnt--; 1677*5084Sjohnlev page_io_pool_sub(curpool, pp_first, pp_last); 1678*5084Sjohnlev if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn) 1679*5084Sjohnlev start_mfn = mfn; 1680*5084Sjohnlev page_free(pp_first, 1); 1681*5084Sjohnlev pp = *curpool; 1682*5084Sjohnlev } 1683*5084Sjohnlev if (retcnt != 0 && !bothpools) { 1684*5084Sjohnlev /* 1685*5084Sjohnlev * If not enough found in less constrained pool try the 1686*5084Sjohnlev * more constrained one. 1687*5084Sjohnlev */ 1688*5084Sjohnlev curpool = &io_pool_16m; 1689*5084Sjohnlev bothpools = 1; 1690*5084Sjohnlev goto domore; 1691*5084Sjohnlev } 1692*5084Sjohnlev done: 1693*5084Sjohnlev mutex_exit(&io_pool_lock); 1694*5084Sjohnlev } 1695*5084Sjohnlev 1696*5084Sjohnlev #endif /* __xpv */ 1697*5084Sjohnlev 1698*5084Sjohnlev uint_t 1699*5084Sjohnlev page_create_update_flags_x86(uint_t flags) 1700*5084Sjohnlev { 1701*5084Sjohnlev #if defined(__xpv) 1702*5084Sjohnlev /* 1703*5084Sjohnlev * Check this is an urgent allocation and free pages are depleted. 1704*5084Sjohnlev */ 1705*5084Sjohnlev if (!(flags & PG_WAIT) && freemem < desfree) 1706*5084Sjohnlev page_io_pool_shrink(); 1707*5084Sjohnlev #else /* !__xpv */ 1708*5084Sjohnlev /* 1709*5084Sjohnlev * page_create_get_something may call this because 4g memory may be 1710*5084Sjohnlev * depleted. Set flags to allow for relocation of base page below 1711*5084Sjohnlev * 4g if necessary. 1712*5084Sjohnlev */ 1713*5084Sjohnlev if (physmax4g) 1714*5084Sjohnlev flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); 1715*5084Sjohnlev #endif /* __xpv */ 1716*5084Sjohnlev return (flags); 1717*5084Sjohnlev } 1718*5084Sjohnlev 17190Sstevel@tonic-gate /*ARGSUSED*/ 17200Sstevel@tonic-gate int 17210Sstevel@tonic-gate bp_color(struct buf *bp) 17220Sstevel@tonic-gate { 17230Sstevel@tonic-gate return (0); 17240Sstevel@tonic-gate } 17250Sstevel@tonic-gate 1726*5084Sjohnlev #if defined(__xpv) 1727*5084Sjohnlev 1728*5084Sjohnlev /* 1729*5084Sjohnlev * Take pages out of an io_pool 1730*5084Sjohnlev */ 1731*5084Sjohnlev static void 1732*5084Sjohnlev page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last) 1733*5084Sjohnlev { 1734*5084Sjohnlev if (*poolp == pp_first) { 1735*5084Sjohnlev *poolp = pp_last->p_next; 1736*5084Sjohnlev if (*poolp == pp_first) 1737*5084Sjohnlev *poolp = NULL; 1738*5084Sjohnlev } 1739*5084Sjohnlev pp_first->p_prev->p_next = pp_last->p_next; 1740*5084Sjohnlev pp_last->p_next->p_prev = pp_first->p_prev; 1741*5084Sjohnlev pp_first->p_prev = pp_last; 1742*5084Sjohnlev pp_last->p_next = pp_first; 1743*5084Sjohnlev } 1744*5084Sjohnlev 1745*5084Sjohnlev /* 1746*5084Sjohnlev * Put a page on the io_pool list. The list is ordered by increasing MFN. 
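* The list is circular and doubly linked through p_next/p_prev, and *poolp
* always points at the lowest mfn, so (*poolp)->p_prev is the highest mfn in
* the pool; the shrink and allocation paths harvest from that tail end.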
1747*5084Sjohnlev */ 1748*5084Sjohnlev static void 1749*5084Sjohnlev page_io_pool_add(page_t **poolp, page_t *pp) 1750*5084Sjohnlev { 1751*5084Sjohnlev page_t *look; 1752*5084Sjohnlev mfn_t mfn = mfn_list[pp->p_pagenum]; 1753*5084Sjohnlev 1754*5084Sjohnlev if (*poolp == NULL) { 1755*5084Sjohnlev *poolp = pp; 1756*5084Sjohnlev pp->p_next = pp; 1757*5084Sjohnlev pp->p_prev = pp; 1758*5084Sjohnlev return; 1759*5084Sjohnlev } 1760*5084Sjohnlev 1761*5084Sjohnlev /* 1762*5084Sjohnlev * Since we try to take pages from the high end of the pool 1763*5084Sjohnlev * chances are good that the pages to be put on the list will 1764*5084Sjohnlev * go at or near the end of the list. so start at the end and 1765*5084Sjohnlev * work backwards. 1766*5084Sjohnlev */ 1767*5084Sjohnlev look = (*poolp)->p_prev; 1768*5084Sjohnlev while (mfn < mfn_list[look->p_pagenum]) { 1769*5084Sjohnlev look = look->p_prev; 1770*5084Sjohnlev if (look == (*poolp)->p_prev) 1771*5084Sjohnlev break; /* backed all the way to front of list */ 1772*5084Sjohnlev } 1773*5084Sjohnlev 1774*5084Sjohnlev /* insert after look */ 1775*5084Sjohnlev pp->p_prev = look; 1776*5084Sjohnlev pp->p_next = look->p_next; 1777*5084Sjohnlev pp->p_next->p_prev = pp; 1778*5084Sjohnlev look->p_next = pp; 1779*5084Sjohnlev if (mfn < mfn_list[(*poolp)->p_pagenum]) { 1780*5084Sjohnlev /* 1781*5084Sjohnlev * we inserted a new first list element 1782*5084Sjohnlev * adjust pool pointer to newly inserted element 1783*5084Sjohnlev */ 1784*5084Sjohnlev *poolp = pp; 1785*5084Sjohnlev } 1786*5084Sjohnlev } 1787*5084Sjohnlev 1788*5084Sjohnlev /* 1789*5084Sjohnlev * Add a page to the io_pool. Setting the force flag will force the page 1790*5084Sjohnlev * into the io_pool no matter what. 1791*5084Sjohnlev */ 1792*5084Sjohnlev static void 1793*5084Sjohnlev add_page_to_pool(page_t *pp, int force) 1794*5084Sjohnlev { 1795*5084Sjohnlev page_t *highest; 1796*5084Sjohnlev page_t *freep = NULL; 1797*5084Sjohnlev 1798*5084Sjohnlev mutex_enter(&io_pool_lock); 1799*5084Sjohnlev /* 1800*5084Sjohnlev * Always keep the scarce low memory pages 1801*5084Sjohnlev */ 1802*5084Sjohnlev if (mfn_list[pp->p_pagenum] < PFN_16MEG) { 1803*5084Sjohnlev ++io_pool_cnt; 1804*5084Sjohnlev page_io_pool_add(&io_pool_16m, pp); 1805*5084Sjohnlev goto done; 1806*5084Sjohnlev } 1807*5084Sjohnlev if (io_pool_cnt < io_pool_cnt_max || force) { 1808*5084Sjohnlev ++io_pool_cnt; 1809*5084Sjohnlev page_io_pool_add(&io_pool_4g, pp); 1810*5084Sjohnlev } else { 1811*5084Sjohnlev highest = io_pool_4g->p_prev; 1812*5084Sjohnlev if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) { 1813*5084Sjohnlev page_io_pool_sub(&io_pool_4g, highest, highest); 1814*5084Sjohnlev page_io_pool_add(&io_pool_4g, pp); 1815*5084Sjohnlev freep = highest; 1816*5084Sjohnlev } else { 1817*5084Sjohnlev freep = pp; 1818*5084Sjohnlev } 1819*5084Sjohnlev } 1820*5084Sjohnlev done: 1821*5084Sjohnlev mutex_exit(&io_pool_lock); 1822*5084Sjohnlev if (freep) 1823*5084Sjohnlev page_free(freep, 1); 1824*5084Sjohnlev } 1825*5084Sjohnlev 1826*5084Sjohnlev 1827*5084Sjohnlev int contig_pfn_cnt; /* no of pfns in the contig pfn list */ 1828*5084Sjohnlev int contig_pfn_max; /* capacity of the contig pfn list */ 1829*5084Sjohnlev int next_alloc_pfn; /* next position in list to start a contig search */ 1830*5084Sjohnlev int contig_pfnlist_updates; /* pfn list update count */ 1831*5084Sjohnlev int contig_pfnlist_locked; /* contig pfn list locked against use */ 1832*5084Sjohnlev int contig_pfnlist_builds; /* how many times have we (re)built list */ 
1833*5084Sjohnlev int contig_pfnlist_buildfailed; /* how many times has list build failed */ 1834*5084Sjohnlev int create_contig_pending; /* nonzero means taskq creating contig list */ 1835*5084Sjohnlev pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */ 1836*5084Sjohnlev 1837*5084Sjohnlev /* 1838*5084Sjohnlev * Function to use in sorting a list of pfns by their underlying mfns. 1839*5084Sjohnlev */ 1840*5084Sjohnlev static int 1841*5084Sjohnlev mfn_compare(const void *pfnp1, const void *pfnp2) 1842*5084Sjohnlev { 1843*5084Sjohnlev mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1]; 1844*5084Sjohnlev mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2]; 1845*5084Sjohnlev 1846*5084Sjohnlev if (mfn1 > mfn2) 1847*5084Sjohnlev return (1); 1848*5084Sjohnlev if (mfn1 < mfn2) 1849*5084Sjohnlev return (-1); 1850*5084Sjohnlev return (0); 1851*5084Sjohnlev } 1852*5084Sjohnlev 1853*5084Sjohnlev /* 1854*5084Sjohnlev * Compact the contig_pfn_list by tossing all the non-contiguous 1855*5084Sjohnlev * elements from the list. 1856*5084Sjohnlev */ 1857*5084Sjohnlev static void 1858*5084Sjohnlev compact_contig_pfn_list(void) 1859*5084Sjohnlev { 1860*5084Sjohnlev pfn_t pfn, lapfn, prev_lapfn; 1861*5084Sjohnlev mfn_t mfn; 1862*5084Sjohnlev int i, newcnt = 0; 1863*5084Sjohnlev 1864*5084Sjohnlev prev_lapfn = 0; 1865*5084Sjohnlev for (i = 0; i < contig_pfn_cnt - 1; i++) { 1866*5084Sjohnlev pfn = contig_pfn_list[i]; 1867*5084Sjohnlev lapfn = contig_pfn_list[i + 1]; 1868*5084Sjohnlev mfn = mfn_list[pfn]; 1869*5084Sjohnlev /* 1870*5084Sjohnlev * See if next pfn is for a contig mfn 1871*5084Sjohnlev */ 1872*5084Sjohnlev if (mfn_list[lapfn] != mfn + 1) 1873*5084Sjohnlev continue; 1874*5084Sjohnlev /* 1875*5084Sjohnlev * pfn and lookahead are both put in list 1876*5084Sjohnlev * unless pfn is the previous lookahead. 1877*5084Sjohnlev */ 1878*5084Sjohnlev if (pfn != prev_lapfn) 1879*5084Sjohnlev contig_pfn_list[newcnt++] = pfn; 1880*5084Sjohnlev contig_pfn_list[newcnt++] = lapfn; 1881*5084Sjohnlev prev_lapfn = lapfn; 1882*5084Sjohnlev } 1883*5084Sjohnlev for (i = newcnt; i < contig_pfn_cnt; i++) 1884*5084Sjohnlev contig_pfn_list[i] = 0; 1885*5084Sjohnlev contig_pfn_cnt = newcnt; 1886*5084Sjohnlev } 1887*5084Sjohnlev 1888*5084Sjohnlev /*ARGSUSED*/ 1889*5084Sjohnlev static void 1890*5084Sjohnlev call_create_contiglist(void *arg) 1891*5084Sjohnlev { 1892*5084Sjohnlev mutex_enter(&io_pool_lock); 1893*5084Sjohnlev (void) create_contig_pfnlist(PG_WAIT); 1894*5084Sjohnlev create_contig_pending = 0; 1895*5084Sjohnlev mutex_exit(&io_pool_lock); 1896*5084Sjohnlev } 1897*5084Sjohnlev 1898*5084Sjohnlev /* 1899*5084Sjohnlev * Create list of freelist pfns that have underlying 1900*5084Sjohnlev * contiguous mfns. The list is kept in ascending mfn order. 1901*5084Sjohnlev * returns 1 if list created else 0. 1902*5084Sjohnlev */ 1903*5084Sjohnlev static int 1904*5084Sjohnlev create_contig_pfnlist(uint_t flags) 1905*5084Sjohnlev { 1906*5084Sjohnlev pfn_t pfn; 1907*5084Sjohnlev page_t *pp; 1908*5084Sjohnlev 1909*5084Sjohnlev if (contig_pfn_list != NULL) 1910*5084Sjohnlev return (1); 1911*5084Sjohnlev ASSERT(!contig_pfnlist_locked); 1912*5084Sjohnlev contig_pfn_max = freemem + (freemem / 10); 1913*5084Sjohnlev contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t), 1914*5084Sjohnlev (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP); 1915*5084Sjohnlev if (contig_pfn_list == NULL) { 1916*5084Sjohnlev /* 1917*5084Sjohnlev * If we could not create the contig list (because 1918*5084Sjohnlev * we could not sleep for memory). 
Dispatch a taskq that can 1919*5084Sjohnlev * sleep to get the memory. 1920*5084Sjohnlev */ 1921*5084Sjohnlev if (!create_contig_pending) { 1922*5084Sjohnlev if (taskq_dispatch(system_taskq, call_create_contiglist, 1923*5084Sjohnlev NULL, TQ_NOSLEEP) != NULL) 1924*5084Sjohnlev create_contig_pending = 1; 1925*5084Sjohnlev } 1926*5084Sjohnlev contig_pfnlist_buildfailed++; /* count list build failures */ 1927*5084Sjohnlev return (0); 1928*5084Sjohnlev } 1929*5084Sjohnlev ASSERT(contig_pfn_cnt == 0); 1930*5084Sjohnlev for (pfn = 0; pfn < mfn_count; pfn++) { 1931*5084Sjohnlev pp = page_numtopp_nolock(pfn); 1932*5084Sjohnlev if (pp == NULL || !PP_ISFREE(pp)) 1933*5084Sjohnlev continue; 1934*5084Sjohnlev contig_pfn_list[contig_pfn_cnt] = pfn; 1935*5084Sjohnlev if (++contig_pfn_cnt == contig_pfn_max) 1936*5084Sjohnlev break; 1937*5084Sjohnlev } 1938*5084Sjohnlev qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare); 1939*5084Sjohnlev compact_contig_pfn_list(); 1940*5084Sjohnlev /* 1941*5084Sjohnlev * Make sure next search of the newly created contiguous pfn 1942*5084Sjohnlev * list starts at the beginning of the list. 1943*5084Sjohnlev */ 1944*5084Sjohnlev next_alloc_pfn = 0; 1945*5084Sjohnlev contig_pfnlist_builds++; /* count list builds */ 1946*5084Sjohnlev return (1); 1947*5084Sjohnlev } 1948*5084Sjohnlev 1949*5084Sjohnlev 1950*5084Sjohnlev /* 1951*5084Sjohnlev * Toss the current contig pfnlist. Someone is about to do a massive 1952*5084Sjohnlev * update to pfn<->mfn mappings. So we have them destroy the list and lock 1953*5084Sjohnlev * it till they are done with their update. 1954*5084Sjohnlev */ 1955*5084Sjohnlev void 1956*5084Sjohnlev clear_and_lock_contig_pfnlist() 1957*5084Sjohnlev { 1958*5084Sjohnlev pfn_t *listp = NULL; 1959*5084Sjohnlev size_t listsize; 1960*5084Sjohnlev 1961*5084Sjohnlev mutex_enter(&io_pool_lock); 1962*5084Sjohnlev ASSERT(!contig_pfnlist_locked); 1963*5084Sjohnlev if (contig_pfn_list != NULL) { 1964*5084Sjohnlev listp = contig_pfn_list; 1965*5084Sjohnlev listsize = contig_pfn_max * sizeof (pfn_t); 1966*5084Sjohnlev contig_pfn_list = NULL; 1967*5084Sjohnlev contig_pfn_max = contig_pfn_cnt = 0; 1968*5084Sjohnlev } 1969*5084Sjohnlev contig_pfnlist_locked = 1; 1970*5084Sjohnlev mutex_exit(&io_pool_lock); 1971*5084Sjohnlev if (listp != NULL) 1972*5084Sjohnlev kmem_free(listp, listsize); 1973*5084Sjohnlev } 1974*5084Sjohnlev 1975*5084Sjohnlev /* 1976*5084Sjohnlev * Unlock the contig_pfn_list. The next attempted use of it will cause 1977*5084Sjohnlev * it to be re-created. 1978*5084Sjohnlev */ 1979*5084Sjohnlev void 1980*5084Sjohnlev unlock_contig_pfnlist() 1981*5084Sjohnlev { 1982*5084Sjohnlev mutex_enter(&io_pool_lock); 1983*5084Sjohnlev ASSERT(contig_pfnlist_locked); 1984*5084Sjohnlev contig_pfnlist_locked = 0; 1985*5084Sjohnlev mutex_exit(&io_pool_lock); 1986*5084Sjohnlev } 1987*5084Sjohnlev 1988*5084Sjohnlev /* 1989*5084Sjohnlev * Update the contiguous pfn list in response to a pfn <-> mfn reassignment 1990*5084Sjohnlev */ 1991*5084Sjohnlev void 1992*5084Sjohnlev update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn) 1993*5084Sjohnlev { 1994*5084Sjohnlev int probe_hi, probe_lo, probe_pos, insert_after, insert_point; 1995*5084Sjohnlev pfn_t probe_pfn; 1996*5084Sjohnlev mfn_t probe_mfn; 1997*5084Sjohnlev 1998*5084Sjohnlev if (contig_pfn_list == NULL) 1999*5084Sjohnlev return; 2000*5084Sjohnlev mutex_enter(&io_pool_lock); 2001*5084Sjohnlev contig_pfnlist_updates++; 2002*5084Sjohnlev /* 2003*5084Sjohnlev * Find the pfn in the current list. 
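* (The list is kept sorted by underlying mfn, so oldmfn is the search key.)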
Use a binary chop to locate it. 2004*5084Sjohnlev */ 2005*5084Sjohnlev probe_hi = contig_pfn_cnt - 1; 2006*5084Sjohnlev probe_lo = 0; 2007*5084Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 2008*5084Sjohnlev while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) { 2009*5084Sjohnlev if (probe_pos == probe_lo) { /* pfn not in list */ 2010*5084Sjohnlev probe_pos = -1; 2011*5084Sjohnlev break; 2012*5084Sjohnlev } 2013*5084Sjohnlev if (pfn_to_mfn(probe_pfn) <= oldmfn) 2014*5084Sjohnlev probe_lo = probe_pos; 2015*5084Sjohnlev else 2016*5084Sjohnlev probe_hi = probe_pos; 2017*5084Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 2018*5084Sjohnlev } 2019*5084Sjohnlev if (probe_pos >= 0) { /* remove pfn from list */ 2020*5084Sjohnlev contig_pfn_cnt--; 2021*5084Sjohnlev ovbcopy(&contig_pfn_list[probe_pos + 1], 2022*5084Sjohnlev &contig_pfn_list[probe_pos], 2023*5084Sjohnlev (contig_pfn_cnt - probe_pos) * sizeof (pfn_t)); 2024*5084Sjohnlev } 2025*5084Sjohnlev if (newmfn == MFN_INVALID) 2026*5084Sjohnlev goto done; 2027*5084Sjohnlev /* 2028*5084Sjohnlev * Check if new mfn has adjacent mfns in the list 2029*5084Sjohnlev */ 2030*5084Sjohnlev probe_hi = contig_pfn_cnt - 1; 2031*5084Sjohnlev probe_lo = 0; 2032*5084Sjohnlev insert_after = -2; 2033*5084Sjohnlev do { 2034*5084Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 2035*5084Sjohnlev probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]); 2036*5084Sjohnlev if (newmfn == probe_mfn + 1) 2037*5084Sjohnlev insert_after = probe_pos; 2038*5084Sjohnlev else if (newmfn == probe_mfn - 1) 2039*5084Sjohnlev insert_after = probe_pos - 1; 2040*5084Sjohnlev if (probe_pos == probe_lo) 2041*5084Sjohnlev break; 2042*5084Sjohnlev if (probe_mfn <= newmfn) 2043*5084Sjohnlev probe_lo = probe_pos; 2044*5084Sjohnlev else 2045*5084Sjohnlev probe_hi = probe_pos; 2046*5084Sjohnlev } while (insert_after == -2); 2047*5084Sjohnlev /* 2048*5084Sjohnlev * If there is space in the list and there are adjacent mfns, 2049*5084Sjohnlev * insert the pfn into its proper place in the list. 2050*5084Sjohnlev */ 2051*5084Sjohnlev if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) { 2052*5084Sjohnlev insert_point = insert_after + 1; 2053*5084Sjohnlev ovbcopy(&contig_pfn_list[insert_point], 2054*5084Sjohnlev &contig_pfn_list[insert_point + 1], 2055*5084Sjohnlev (contig_pfn_cnt - insert_point) * sizeof (pfn_t)); 2056*5084Sjohnlev contig_pfn_list[insert_point] = pfn; 2057*5084Sjohnlev contig_pfn_cnt++; 2058*5084Sjohnlev } 2059*5084Sjohnlev done: 2060*5084Sjohnlev mutex_exit(&io_pool_lock); 2061*5084Sjohnlev } 2062*5084Sjohnlev 2063*5084Sjohnlev /* 2064*5084Sjohnlev * Called to (re-)populate the io_pool from the free page lists. 2065*5084Sjohnlev */ 2066*5084Sjohnlev long 2067*5084Sjohnlev populate_io_pool(void) 2068*5084Sjohnlev { 2069*5084Sjohnlev pfn_t pfn; 2070*5084Sjohnlev mfn_t mfn, max_mfn; 2071*5084Sjohnlev page_t *pp; 2072*5084Sjohnlev 2073*5084Sjohnlev /* 2074*5084Sjohnlev * Figure out the bounds of the pool on first invocation. 2075*5084Sjohnlev * We use a percentage of memory for the io pool size. 2076*5084Sjohnlev * We allow that to shrink, but not to less than a fixed minimum. 2077*5084Sjohnlev */ 2078*5084Sjohnlev if (io_pool_cnt_max == 0) { 2079*5084Sjohnlev io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct); 2080*5084Sjohnlev io_pool_cnt_lowater = io_pool_cnt_max; 2081*5084Sjohnlev /* 2082*5084Sjohnlev * This is the first time in populate_io_pool, grab a va to use 2083*5084Sjohnlev * when we need to allocate pages.
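* (io_pool_kva is used here only as a scratch mapping address; page_create_io()
* passes it to page_create_va() when it has to manufacture extra extent pages
* to trade with the hypervisor.)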
2084*5084Sjohnlev */ 2085*5084Sjohnlev io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); 2086*5084Sjohnlev } 2087*5084Sjohnlev /* 2088*5084Sjohnlev * If we are out of pages in the pool, then grow the size of the pool 2089*5084Sjohnlev */ 2090*5084Sjohnlev if (io_pool_cnt == 0) 2091*5084Sjohnlev io_pool_cnt_max += io_pool_cnt_max / 20; /* grow by 5% */ 2092*5084Sjohnlev io_pool_grows++; /* should be a kstat? */ 2093*5084Sjohnlev 2094*5084Sjohnlev /* 2095*5084Sjohnlev * Get highest mfn on this platform, but limit to the 32 bit DMA max. 2096*5084Sjohnlev */ 2097*5084Sjohnlev (void) mfn_to_pfn(start_mfn); 2098*5084Sjohnlev max_mfn = MIN(cached_max_mfn, PFN_4GIG); 2099*5084Sjohnlev for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) { 2100*5084Sjohnlev pfn = mfn_to_pfn(mfn); 2101*5084Sjohnlev if (pfn & PFN_IS_FOREIGN_MFN) 2102*5084Sjohnlev continue; 2103*5084Sjohnlev /* 2104*5084Sjohnlev * try to allocate it from free pages 2105*5084Sjohnlev */ 2106*5084Sjohnlev pp = page_numtopp_alloc(pfn); 2107*5084Sjohnlev if (pp == NULL) 2108*5084Sjohnlev continue; 2109*5084Sjohnlev PP_CLRFREE(pp); 2110*5084Sjohnlev add_page_to_pool(pp, 1); 2111*5084Sjohnlev if (io_pool_cnt >= io_pool_cnt_max) 2112*5084Sjohnlev break; 2113*5084Sjohnlev } 2114*5084Sjohnlev 2115*5084Sjohnlev return (io_pool_cnt); 2116*5084Sjohnlev } 2117*5084Sjohnlev 2118*5084Sjohnlev /* 2119*5084Sjohnlev * Destroy a page that was being used for DMA I/O. It may or 2120*5084Sjohnlev * may not actually go back to the io_pool. 2121*5084Sjohnlev */ 2122*5084Sjohnlev void 2123*5084Sjohnlev page_destroy_io(page_t *pp) 2124*5084Sjohnlev { 2125*5084Sjohnlev mfn_t mfn = mfn_list[pp->p_pagenum]; 2126*5084Sjohnlev 2127*5084Sjohnlev /* 2128*5084Sjohnlev * When the page was alloc'd a reservation was made, release it now 2129*5084Sjohnlev */ 2130*5084Sjohnlev page_unresv(1); 2131*5084Sjohnlev /* 2132*5084Sjohnlev * Unload translations, if any, then hash out the 2133*5084Sjohnlev * page to erase its identity. 2134*5084Sjohnlev */ 2135*5084Sjohnlev (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 2136*5084Sjohnlev page_hashout(pp, NULL); 2137*5084Sjohnlev 2138*5084Sjohnlev /* 2139*5084Sjohnlev * If the page came from the free lists, just put it back to them. 2140*5084Sjohnlev * DomU pages always go on the free lists as well. 2141*5084Sjohnlev */ 2142*5084Sjohnlev if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) { 2143*5084Sjohnlev page_free(pp, 1); 2144*5084Sjohnlev return; 2145*5084Sjohnlev } 2146*5084Sjohnlev 2147*5084Sjohnlev add_page_to_pool(pp, 0); 2148*5084Sjohnlev } 2149*5084Sjohnlev 2150*5084Sjohnlev 2151*5084Sjohnlev long contig_searches; /* count of times contig pages requested */ 2152*5084Sjohnlev long contig_search_restarts; /* count of contig ranges tried */ 2153*5084Sjohnlev long contig_search_failed; /* count of contig alloc failures */ 2154*5084Sjohnlev 2155*5084Sjohnlev /* 2156*5084Sjohnlev * Look thru the contiguous pfns that are not part of the io_pool for 2157*5084Sjohnlev * contiguous free pages. Return a list of the found pages or NULL. 
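* The pages returned are linked with the same circular p_next/p_prev scheme
* the io_pool lists use, and any partially collected run is released back to
* the free lists as soon as a discontiguity is hit.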
2158*5084Sjohnlev */ 2159*5084Sjohnlev page_t * 2160*5084Sjohnlev find_contig_free(uint_t bytes, uint_t flags) 2161*5084Sjohnlev { 2162*5084Sjohnlev page_t *pp, *plist = NULL; 2163*5084Sjohnlev mfn_t mfn, prev_mfn; 2164*5084Sjohnlev pfn_t pfn; 2165*5084Sjohnlev int pages_needed, pages_requested; 2166*5084Sjohnlev int search_start; 2167*5084Sjohnlev 2168*5084Sjohnlev /* 2169*5084Sjohnlev * create the contig pfn list if not already done 2170*5084Sjohnlev */ 2171*5084Sjohnlev if (contig_pfn_list == NULL) { 2172*5084Sjohnlev if (contig_pfnlist_locked) { 2173*5084Sjohnlev return (NULL); 2174*5084Sjohnlev } else { 2175*5084Sjohnlev if (!create_contig_pfnlist(flags)) 2176*5084Sjohnlev return (NULL); 2177*5084Sjohnlev } 2178*5084Sjohnlev } 2179*5084Sjohnlev contig_searches++; 2180*5084Sjohnlev /* 2181*5084Sjohnlev * Search contiguous pfn list for physically contiguous pages not in 2182*5084Sjohnlev * the io_pool. Start the search where the last search left off. 2183*5084Sjohnlev */ 2184*5084Sjohnlev pages_requested = pages_needed = mmu_btop(bytes); 2185*5084Sjohnlev search_start = next_alloc_pfn; 2186*5084Sjohnlev prev_mfn = 0; 2187*5084Sjohnlev while (pages_needed) { 2188*5084Sjohnlev pfn = contig_pfn_list[next_alloc_pfn]; 2189*5084Sjohnlev mfn = pfn_to_mfn(pfn); 2190*5084Sjohnlev if ((prev_mfn == 0 || mfn == prev_mfn + 1) && 2191*5084Sjohnlev (pp = page_numtopp_alloc(pfn)) != NULL) { 2192*5084Sjohnlev PP_CLRFREE(pp); 2193*5084Sjohnlev page_io_pool_add(&plist, pp); 2194*5084Sjohnlev pages_needed--; 2195*5084Sjohnlev prev_mfn = mfn; 2196*5084Sjohnlev } else { 2197*5084Sjohnlev contig_search_restarts++; 2198*5084Sjohnlev /* 2199*5084Sjohnlev * free partial page list 2200*5084Sjohnlev */ 2201*5084Sjohnlev while (plist != NULL) { 2202*5084Sjohnlev pp = plist; 2203*5084Sjohnlev page_io_pool_sub(&plist, pp, pp); 2204*5084Sjohnlev page_free(pp, 1); 2205*5084Sjohnlev } 2206*5084Sjohnlev pages_needed = pages_requested; 2207*5084Sjohnlev prev_mfn = 0; 2208*5084Sjohnlev } 2209*5084Sjohnlev if (++next_alloc_pfn == contig_pfn_cnt) 2210*5084Sjohnlev next_alloc_pfn = 0; 2211*5084Sjohnlev if (next_alloc_pfn == search_start) 2212*5084Sjohnlev break; /* all pfns searched */ 2213*5084Sjohnlev } 2214*5084Sjohnlev if (pages_needed) { 2215*5084Sjohnlev contig_search_failed++; 2216*5084Sjohnlev /* 2217*5084Sjohnlev * Failed to find enough contig pages. 2218*5084Sjohnlev * free partial page list 2219*5084Sjohnlev */ 2220*5084Sjohnlev while (plist != NULL) { 2221*5084Sjohnlev pp = plist; 2222*5084Sjohnlev page_io_pool_sub(&plist, pp, pp); 2223*5084Sjohnlev page_free(pp, 1); 2224*5084Sjohnlev } 2225*5084Sjohnlev } 2226*5084Sjohnlev return (plist); 2227*5084Sjohnlev } 2228*5084Sjohnlev 2229*5084Sjohnlev /* 2230*5084Sjohnlev * Allocator for domain 0 I/O pages. We match the required 2231*5084Sjohnlev * DMA attributes and contiguity constraints. 
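* Callers are expected to supply the device's ddi_dma_attr_t in mattr (it is
* ASSERTed non-NULL below); requests whose dma_attr_addr_hi lies under 16M
* are served from io_pool_16m, others from io_pool_4g, while unconstrained
* non-contiguous requests fall straight through to page_create_va().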
2232*5084Sjohnlev */ 2233*5084Sjohnlev /*ARGSUSED*/ 2234*5084Sjohnlev page_t * 2235*5084Sjohnlev page_create_io( 2236*5084Sjohnlev struct vnode *vp, 2237*5084Sjohnlev u_offset_t off, 2238*5084Sjohnlev uint_t bytes, 2239*5084Sjohnlev uint_t flags, 2240*5084Sjohnlev struct as *as, 2241*5084Sjohnlev caddr_t vaddr, 2242*5084Sjohnlev ddi_dma_attr_t *mattr) 2243*5084Sjohnlev { 2244*5084Sjohnlev mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 2245*5084Sjohnlev page_t *pp_first; /* list to return */ 2246*5084Sjohnlev page_t *pp_last; /* last in list to return */ 2247*5084Sjohnlev page_t *pp, **poolp, **pplist = NULL, *expp; 2248*5084Sjohnlev int i, extpages = 0, npages = 0, contig, anyaddr, extra; 2249*5084Sjohnlev mfn_t lo_mfn; 2250*5084Sjohnlev mfn_t hi_mfn; 2251*5084Sjohnlev mfn_t mfn, tmfn; 2252*5084Sjohnlev mfn_t *mfnlist = 0; 2253*5084Sjohnlev pgcnt_t pfnalign = 0; 2254*5084Sjohnlev int align, order, nbits, extents; 2255*5084Sjohnlev uint64_t pfnseg; 2256*5084Sjohnlev int attempt = 0, is_domu = 0; 2257*5084Sjohnlev int asked_hypervisor = 0; 2258*5084Sjohnlev uint_t kflags; 2259*5084Sjohnlev 2260*5084Sjohnlev ASSERT(mattr != NULL); 2261*5084Sjohnlev lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 2262*5084Sjohnlev hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 2263*5084Sjohnlev align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2264*5084Sjohnlev if (align > MMU_PAGESIZE) 2265*5084Sjohnlev pfnalign = mmu_btop(align); 2266*5084Sjohnlev pfnseg = mmu_btop(mattr->dma_attr_seg); 2267*5084Sjohnlev 2268*5084Sjohnlev /* 2269*5084Sjohnlev * Clear the contig flag if only one page is needed. 2270*5084Sjohnlev */ 2271*5084Sjohnlev contig = (flags & PG_PHYSCONTIG); 2272*5084Sjohnlev flags &= ~PG_PHYSCONTIG; 2273*5084Sjohnlev bytes = P2ROUNDUP(bytes, MMU_PAGESIZE); 2274*5084Sjohnlev if (bytes == MMU_PAGESIZE) 2275*5084Sjohnlev contig = 0; 2276*5084Sjohnlev 2277*5084Sjohnlev /* 2278*5084Sjohnlev * Check if any old page in the system is fine. 2279*5084Sjohnlev * DomU should always go down this path. 2280*5084Sjohnlev */ 2281*5084Sjohnlev is_domu = !DOMAIN_IS_INITDOMAIN(xen_info); 2282*5084Sjohnlev anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign; 2283*5084Sjohnlev if ((!contig && anyaddr) || is_domu) { 2284*5084Sjohnlev pp = page_create_va(vp, off, bytes, flags, &kvseg, vaddr); 2285*5084Sjohnlev if (pp) 2286*5084Sjohnlev return (pp); 2287*5084Sjohnlev else if (is_domu) 2288*5084Sjohnlev return (NULL); /* no memory available */ 2289*5084Sjohnlev } 2290*5084Sjohnlev /* 2291*5084Sjohnlev * DomU should never reach here 2292*5084Sjohnlev */ 2293*5084Sjohnlev try_again: 2294*5084Sjohnlev /* 2295*5084Sjohnlev * We could just want unconstrained but contig pages. 2296*5084Sjohnlev */ 2297*5084Sjohnlev if (anyaddr && contig && pfnseg >= max_mfn) { 2298*5084Sjohnlev /* 2299*5084Sjohnlev * Look for free contig pages to satisfy the request. 2300*5084Sjohnlev */ 2301*5084Sjohnlev mutex_enter(&io_pool_lock); 2302*5084Sjohnlev pp_first = find_contig_free(bytes, flags); 2303*5084Sjohnlev mutex_exit(&io_pool_lock); 2304*5084Sjohnlev if (pp_first != NULL) 2305*5084Sjohnlev goto done; 2306*5084Sjohnlev } 2307*5084Sjohnlev /* 2308*5084Sjohnlev * See if we want pages for a legacy device 2309*5084Sjohnlev */ 2310*5084Sjohnlev if (hi_mfn < PFN_16MEG) 2311*5084Sjohnlev poolp = &io_pool_16m; 2312*5084Sjohnlev else 2313*5084Sjohnlev poolp = &io_pool_4g; 2314*5084Sjohnlev try_smaller: 2315*5084Sjohnlev /* 2316*5084Sjohnlev * Take pages from I/O pool. 
We'll use pages from the highest MFN 2317*5084Sjohnlev * range possible. 2318*5084Sjohnlev */ 2319*5084Sjohnlev pp_first = pp_last = NULL; 2320*5084Sjohnlev npages = mmu_btop(bytes); 2321*5084Sjohnlev mutex_enter(&io_pool_lock); 2322*5084Sjohnlev for (pp = *poolp; pp && npages > 0; ) { 2323*5084Sjohnlev pp = pp->p_prev; 2324*5084Sjohnlev 2325*5084Sjohnlev /* 2326*5084Sjohnlev * skip pages above allowable range 2327*5084Sjohnlev */ 2328*5084Sjohnlev mfn = mfn_list[pp->p_pagenum]; 2329*5084Sjohnlev if (hi_mfn < mfn) 2330*5084Sjohnlev goto skip; 2331*5084Sjohnlev 2332*5084Sjohnlev /* 2333*5084Sjohnlev * stop at pages below allowable range 2334*5084Sjohnlev */ 2335*5084Sjohnlev if (lo_mfn > mfn) 2336*5084Sjohnlev break; 2337*5084Sjohnlev restart: 2338*5084Sjohnlev if (pp_last == NULL) { 2339*5084Sjohnlev /* 2340*5084Sjohnlev * Check alignment 2341*5084Sjohnlev */ 2342*5084Sjohnlev tmfn = mfn - (npages - 1); 2343*5084Sjohnlev if (pfnalign) { 2344*5084Sjohnlev if (tmfn != P2ROUNDUP(tmfn, pfnalign)) 2345*5084Sjohnlev goto skip; /* not properly aligned */ 2346*5084Sjohnlev } 2347*5084Sjohnlev /* 2348*5084Sjohnlev * Check segment 2349*5084Sjohnlev */ 2350*5084Sjohnlev if ((mfn & pfnseg) < (tmfn & pfnseg)) 2351*5084Sjohnlev goto skip; /* crosses segment boundary */ 2352*5084Sjohnlev /* 2353*5084Sjohnlev * Start building page list 2354*5084Sjohnlev */ 2355*5084Sjohnlev pp_first = pp_last = pp; 2356*5084Sjohnlev npages--; 2357*5084Sjohnlev } else { 2358*5084Sjohnlev /* 2359*5084Sjohnlev * check physical contiguity if required 2360*5084Sjohnlev */ 2361*5084Sjohnlev if (contig && 2362*5084Sjohnlev mfn_list[pp_first->p_pagenum] != mfn + 1) { 2363*5084Sjohnlev /* 2364*5084Sjohnlev * not a contiguous page, restart list. 2365*5084Sjohnlev */ 2366*5084Sjohnlev pp_last = NULL; 2367*5084Sjohnlev npages = mmu_btop(bytes); 2368*5084Sjohnlev goto restart; 2369*5084Sjohnlev } else { /* add page to list */ 2370*5084Sjohnlev pp_first = pp; 2371*5084Sjohnlev --npages; 2372*5084Sjohnlev } 2373*5084Sjohnlev } 2374*5084Sjohnlev skip: 2375*5084Sjohnlev if (pp == *poolp) 2376*5084Sjohnlev break; 2377*5084Sjohnlev } 2378*5084Sjohnlev 2379*5084Sjohnlev /* 2380*5084Sjohnlev * If we didn't find memory. Try the more constrained pool, then 2381*5084Sjohnlev * sweep free pages into the DMA pool and try again. If we fail 2382*5084Sjohnlev * repeatedly, ask the Hypervisor for help. 2383*5084Sjohnlev */ 2384*5084Sjohnlev if (npages != 0) { 2385*5084Sjohnlev mutex_exit(&io_pool_lock); 2386*5084Sjohnlev /* 2387*5084Sjohnlev * If we were looking in the less constrained pool and didn't 2388*5084Sjohnlev * find pages, try the more constrained pool. 2389*5084Sjohnlev */ 2390*5084Sjohnlev if (poolp == &io_pool_4g) { 2391*5084Sjohnlev poolp = &io_pool_16m; 2392*5084Sjohnlev goto try_smaller; 2393*5084Sjohnlev } 2394*5084Sjohnlev kmem_reap(); 2395*5084Sjohnlev if (++attempt < 4) { 2396*5084Sjohnlev /* 2397*5084Sjohnlev * Grab some more io_pool pages 2398*5084Sjohnlev */ 2399*5084Sjohnlev (void) populate_io_pool(); 2400*5084Sjohnlev goto try_again; 2401*5084Sjohnlev } 2402*5084Sjohnlev 2403*5084Sjohnlev if (asked_hypervisor++) 2404*5084Sjohnlev return (NULL); /* really out of luck */ 2405*5084Sjohnlev /* 2406*5084Sjohnlev * Hypervisor exchange doesn't handle segment or alignment 2407*5084Sjohnlev * constraints 2408*5084Sjohnlev */ 2409*5084Sjohnlev if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi || pfnalign) 2410*5084Sjohnlev return (NULL); 2411*5084Sjohnlev /* 2412*5084Sjohnlev * Try exchanging pages with the hypervisor. 
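* For a contiguous request the exchange must be a single power-of-two extent,
* so npages is rounded up to 1 << order (a 3-page request, say, becomes a
* 4-page extent) and the surplus pages are returned to the free lists once
* the exchange succeeds.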
2413*5084Sjohnlev */ 2414*5084Sjohnlev npages = mmu_btop(bytes); 2415*5084Sjohnlev kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP; 2416*5084Sjohnlev /* 2417*5084Sjohnlev * Hypervisor will allocate extents, if we want contig pages 2418*5084Sjohnlev * extent must be >= npages 2419*5084Sjohnlev */ 2420*5084Sjohnlev if (contig) { 2421*5084Sjohnlev order = highbit(npages) - 1; 2422*5084Sjohnlev if (npages & ((1 << order) - 1)) 2423*5084Sjohnlev order++; 2424*5084Sjohnlev extpages = 1 << order; 2425*5084Sjohnlev } else { 2426*5084Sjohnlev order = 0; 2427*5084Sjohnlev extpages = npages; 2428*5084Sjohnlev } 2429*5084Sjohnlev if (extpages > npages) { 2430*5084Sjohnlev extra = extpages - npages; 2431*5084Sjohnlev if (!page_resv(extra, kflags)) 2432*5084Sjohnlev return (NULL); 2433*5084Sjohnlev } 2434*5084Sjohnlev pplist = kmem_alloc(extpages * sizeof (page_t *), kflags); 2435*5084Sjohnlev if (pplist == NULL) 2436*5084Sjohnlev goto fail; 2437*5084Sjohnlev mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags); 2438*5084Sjohnlev if (mfnlist == NULL) 2439*5084Sjohnlev goto fail; 2440*5084Sjohnlev pp = page_create_va(vp, off, npages * PAGESIZE, flags, 2441*5084Sjohnlev &kvseg, vaddr); 2442*5084Sjohnlev if (pp == NULL) 2443*5084Sjohnlev goto fail; 2444*5084Sjohnlev pp_first = pp; 2445*5084Sjohnlev if (extpages > npages) { 2446*5084Sjohnlev /* 2447*5084Sjohnlev * fill out the rest of extent pages to swap with the 2448*5084Sjohnlev * hypervisor 2449*5084Sjohnlev */ 2450*5084Sjohnlev for (i = 0; i < extra; i++) { 2451*5084Sjohnlev expp = page_create_va(vp, 2452*5084Sjohnlev (u_offset_t)(uintptr_t)io_pool_kva, 2453*5084Sjohnlev PAGESIZE, flags, &kvseg, io_pool_kva); 2454*5084Sjohnlev if (expp == NULL) 2455*5084Sjohnlev goto balloon_fail; 2456*5084Sjohnlev (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD); 2457*5084Sjohnlev page_io_unlock(expp); 2458*5084Sjohnlev page_hashout(expp, NULL); 2459*5084Sjohnlev page_io_lock(expp); 2460*5084Sjohnlev /* 2461*5084Sjohnlev * add page to end of list 2462*5084Sjohnlev */ 2463*5084Sjohnlev expp->p_prev = pp_first->p_prev; 2464*5084Sjohnlev expp->p_next = pp_first; 2465*5084Sjohnlev expp->p_prev->p_next = expp; 2466*5084Sjohnlev pp_first->p_prev = expp; 2467*5084Sjohnlev } 2468*5084Sjohnlev 2469*5084Sjohnlev } 2470*5084Sjohnlev for (i = 0; i < extpages; i++) { 2471*5084Sjohnlev pplist[i] = pp; 2472*5084Sjohnlev pp = pp->p_next; 2473*5084Sjohnlev } 2474*5084Sjohnlev nbits = highbit(mattr->dma_attr_addr_hi); 2475*5084Sjohnlev extents = contig ? 
1 : npages; 2476*5084Sjohnlev if (balloon_replace_pages(extents, pplist, nbits, order, 2477*5084Sjohnlev mfnlist) != extents) 2478*5084Sjohnlev goto balloon_fail; 2479*5084Sjohnlev 2480*5084Sjohnlev kmem_free(pplist, extpages * sizeof (page_t *)); 2481*5084Sjohnlev kmem_free(mfnlist, extpages * sizeof (mfn_t)); 2482*5084Sjohnlev /* 2483*5084Sjohnlev * Return any excess pages to free list 2484*5084Sjohnlev */ 2485*5084Sjohnlev if (extpages > npages) { 2486*5084Sjohnlev for (i = 0; i < extra; i++) { 2487*5084Sjohnlev pp = pp_first->p_prev; 2488*5084Sjohnlev page_sub(&pp_first, pp); 2489*5084Sjohnlev page_io_unlock(pp); 2490*5084Sjohnlev page_unresv(1); 2491*5084Sjohnlev page_free(pp, 1); 2492*5084Sjohnlev } 2493*5084Sjohnlev } 2494*5084Sjohnlev check_dma(mattr, pp_first, mmu_btop(bytes)); 2495*5084Sjohnlev return (pp_first); 2496*5084Sjohnlev } 2497*5084Sjohnlev 2498*5084Sjohnlev /* 2499*5084Sjohnlev * Found the pages, now snip them from the list 2500*5084Sjohnlev */ 2501*5084Sjohnlev page_io_pool_sub(poolp, pp_first, pp_last); 2502*5084Sjohnlev io_pool_cnt -= mmu_btop(bytes); 2503*5084Sjohnlev if (io_pool_cnt < io_pool_cnt_lowater) 2504*5084Sjohnlev io_pool_cnt_lowater = io_pool_cnt; /* io pool low water mark */ 2505*5084Sjohnlev mutex_exit(&io_pool_lock); 2506*5084Sjohnlev done: 2507*5084Sjohnlev check_dma(mattr, pp_first, mmu_btop(bytes)); 2508*5084Sjohnlev pp = pp_first; 2509*5084Sjohnlev do { 2510*5084Sjohnlev if (!page_hashin(pp, vp, off, NULL)) { 2511*5084Sjohnlev panic("pg_create_io: hashin failed pp %p, vp %p," 2512*5084Sjohnlev " off %llx", 2513*5084Sjohnlev (void *)pp, (void *)vp, off); 2514*5084Sjohnlev } 2515*5084Sjohnlev off += MMU_PAGESIZE; 2516*5084Sjohnlev PP_CLRFREE(pp); 2517*5084Sjohnlev PP_CLRAGED(pp); 2518*5084Sjohnlev page_set_props(pp, P_REF); 2519*5084Sjohnlev page_io_lock(pp); 2520*5084Sjohnlev pp = pp->p_next; 2521*5084Sjohnlev } while (pp != pp_first); 2522*5084Sjohnlev return (pp_first); 2523*5084Sjohnlev balloon_fail: 2524*5084Sjohnlev /* 2525*5084Sjohnlev * Return pages to free list and return failure 2526*5084Sjohnlev */ 2527*5084Sjohnlev while (pp_first != NULL) { 2528*5084Sjohnlev pp = pp_first; 2529*5084Sjohnlev page_sub(&pp_first, pp); 2530*5084Sjohnlev page_io_unlock(pp); 2531*5084Sjohnlev if (pp->p_vnode != NULL) 2532*5084Sjohnlev page_hashout(pp, NULL); 2533*5084Sjohnlev page_free(pp, 1); 2534*5084Sjohnlev } 2535*5084Sjohnlev fail: 2536*5084Sjohnlev if (pplist) 2537*5084Sjohnlev kmem_free(pplist, extpages * sizeof (page_t *)); 2538*5084Sjohnlev if (mfnlist) 2539*5084Sjohnlev kmem_free(mfnlist, extpages * sizeof (mfn_t)); 2540*5084Sjohnlev page_unresv(extpages - npages); 2541*5084Sjohnlev return (NULL); 2542*5084Sjohnlev } 2543*5084Sjohnlev 2544*5084Sjohnlev /* 2545*5084Sjohnlev * Lock and return the page with the highest mfn that we can find. last_mfn 2546*5084Sjohnlev * holds the last one found, so the next search can start from there. We 2547*5084Sjohnlev * also keep a counter so that we don't loop forever if the machine has no 2548*5084Sjohnlev * free pages. 2549*5084Sjohnlev * 2550*5084Sjohnlev * This is called from the balloon thread to find pages to give away. new_high 2551*5084Sjohnlev * is used when new mfn's have been added to the system - we will reset our 2552*5084Sjohnlev * search if the new mfn's are higher than our current search position. 
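* last_mfn is static, so successive calls resume scanning just below the mfn
* handed out last time; when the scan reaches zero it wraps back to
* cached_max_mfn, and the loop counter keeps a single call from spinning
* forever on a machine with no free pages.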
2553*5084Sjohnlev */ 2554*5084Sjohnlev page_t * 2555*5084Sjohnlev page_get_high_mfn(mfn_t new_high) 2556*5084Sjohnlev { 2557*5084Sjohnlev static mfn_t last_mfn = 0; 2558*5084Sjohnlev pfn_t pfn; 2559*5084Sjohnlev page_t *pp; 2560*5084Sjohnlev ulong_t loop_count = 0; 2561*5084Sjohnlev 2562*5084Sjohnlev if (new_high > last_mfn) 2563*5084Sjohnlev last_mfn = new_high; 2564*5084Sjohnlev 2565*5084Sjohnlev for (; loop_count < mfn_count; loop_count++, last_mfn--) { 2566*5084Sjohnlev if (last_mfn == 0) { 2567*5084Sjohnlev last_mfn = cached_max_mfn; 2568*5084Sjohnlev } 2569*5084Sjohnlev 2570*5084Sjohnlev pfn = mfn_to_pfn(last_mfn); 2571*5084Sjohnlev if (pfn & PFN_IS_FOREIGN_MFN) 2572*5084Sjohnlev continue; 2573*5084Sjohnlev 2574*5084Sjohnlev /* See if the page is free. If so, lock it. */ 2575*5084Sjohnlev pp = page_numtopp_alloc(pfn); 2576*5084Sjohnlev if (pp == NULL) 2577*5084Sjohnlev continue; 2578*5084Sjohnlev PP_CLRFREE(pp); 2579*5084Sjohnlev 2580*5084Sjohnlev ASSERT(PAGE_EXCL(pp)); 2581*5084Sjohnlev ASSERT(pp->p_vnode == NULL); 2582*5084Sjohnlev ASSERT(!hat_page_is_mapped(pp)); 2583*5084Sjohnlev last_mfn--; 2584*5084Sjohnlev return (pp); 2585*5084Sjohnlev } 2586*5084Sjohnlev return (NULL); 2587*5084Sjohnlev } 2588*5084Sjohnlev 2589*5084Sjohnlev #else /* !__xpv */ 2590*5084Sjohnlev 25910Sstevel@tonic-gate /* 25920Sstevel@tonic-gate * get a page from any list with the given mnode 25930Sstevel@tonic-gate */ 2594*5084Sjohnlev static page_t * 25950Sstevel@tonic-gate page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 25960Sstevel@tonic-gate int mnode, int mtype, ddi_dma_attr_t *dma_attr) 25970Sstevel@tonic-gate { 25982961Sdp78419 kmutex_t *pcm; 25992961Sdp78419 int i; 26002961Sdp78419 page_t *pp; 26012961Sdp78419 page_t *first_pp; 26022961Sdp78419 uint64_t pgaddr; 26032961Sdp78419 ulong_t bin; 26042961Sdp78419 int mtypestart; 26052961Sdp78419 int plw_initialized; 26062961Sdp78419 page_list_walker_t plw; 26070Sstevel@tonic-gate 26080Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_alloc); 26090Sstevel@tonic-gate 26100Sstevel@tonic-gate ASSERT((flags & PG_MATCH_COLOR) == 0); 26110Sstevel@tonic-gate ASSERT(szc == 0); 26120Sstevel@tonic-gate ASSERT(dma_attr != NULL); 26130Sstevel@tonic-gate 26140Sstevel@tonic-gate MTYPE_START(mnode, mtype, flags); 26150Sstevel@tonic-gate if (mtype < 0) { 26160Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocempty); 26170Sstevel@tonic-gate return (NULL); 26180Sstevel@tonic-gate } 26190Sstevel@tonic-gate 26200Sstevel@tonic-gate mtypestart = mtype; 26210Sstevel@tonic-gate 26220Sstevel@tonic-gate bin = origbin; 26230Sstevel@tonic-gate 26240Sstevel@tonic-gate /* 26250Sstevel@tonic-gate * check up to page_colors + 1 bins - origbin may be checked twice 26260Sstevel@tonic-gate * because of BIN_STEP skip 26270Sstevel@tonic-gate */ 26280Sstevel@tonic-gate do { 26292961Sdp78419 plw_initialized = 0; 26302961Sdp78419 26312961Sdp78419 for (plw.plw_count = 0; 26322961Sdp78419 plw.plw_count < page_colors; plw.plw_count++) { 26332961Sdp78419 26340Sstevel@tonic-gate if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 26350Sstevel@tonic-gate goto nextfreebin; 26360Sstevel@tonic-gate 26370Sstevel@tonic-gate pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 26380Sstevel@tonic-gate mutex_enter(pcm); 26390Sstevel@tonic-gate pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 26400Sstevel@tonic-gate first_pp = pp; 26410Sstevel@tonic-gate while (pp != NULL) { 26420Sstevel@tonic-gate if (page_trylock(pp, SE_EXCL) == 0) { 26430Sstevel@tonic-gate pp = pp->p_next; 26440Sstevel@tonic-gate if 
(pp == first_pp) { 26450Sstevel@tonic-gate pp = NULL; 26460Sstevel@tonic-gate } 26470Sstevel@tonic-gate continue; 26480Sstevel@tonic-gate } 26490Sstevel@tonic-gate 26500Sstevel@tonic-gate ASSERT(PP_ISFREE(pp)); 26510Sstevel@tonic-gate ASSERT(PP_ISAGED(pp)); 26520Sstevel@tonic-gate ASSERT(pp->p_vnode == NULL); 26530Sstevel@tonic-gate ASSERT(pp->p_hash == NULL); 26540Sstevel@tonic-gate ASSERT(pp->p_offset == (u_offset_t)-1); 26550Sstevel@tonic-gate ASSERT(pp->p_szc == szc); 26560Sstevel@tonic-gate ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 26570Sstevel@tonic-gate /* check if page within DMA attributes */ 26583446Smrj pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 26590Sstevel@tonic-gate if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 26600Sstevel@tonic-gate (pgaddr + MMU_PAGESIZE - 1 <= 26610Sstevel@tonic-gate dma_attr->dma_attr_addr_hi)) { 26620Sstevel@tonic-gate break; 26630Sstevel@tonic-gate } 26640Sstevel@tonic-gate 26650Sstevel@tonic-gate /* continue looking */ 26660Sstevel@tonic-gate page_unlock(pp); 26670Sstevel@tonic-gate pp = pp->p_next; 26680Sstevel@tonic-gate if (pp == first_pp) 26690Sstevel@tonic-gate pp = NULL; 26700Sstevel@tonic-gate 26710Sstevel@tonic-gate } 26720Sstevel@tonic-gate if (pp != NULL) { 26730Sstevel@tonic-gate ASSERT(mtype == PP_2_MTYPE(pp)); 26740Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 26750Sstevel@tonic-gate 26760Sstevel@tonic-gate /* found a page with specified DMA attributes */ 26770Sstevel@tonic-gate page_sub(&PAGE_FREELISTS(mnode, szc, bin, 26780Sstevel@tonic-gate mtype), pp); 2679414Skchow page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 26800Sstevel@tonic-gate 26810Sstevel@tonic-gate if ((PP_ISFREE(pp) == 0) || 26820Sstevel@tonic-gate (PP_ISAGED(pp) == 0)) { 26830Sstevel@tonic-gate cmn_err(CE_PANIC, "page %p is not free", 26840Sstevel@tonic-gate (void *)pp); 26850Sstevel@tonic-gate } 26860Sstevel@tonic-gate 26870Sstevel@tonic-gate mutex_exit(pcm); 26880Sstevel@tonic-gate check_dma(dma_attr, pp, 1); 26890Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocok); 26900Sstevel@tonic-gate return (pp); 26910Sstevel@tonic-gate } 26920Sstevel@tonic-gate mutex_exit(pcm); 26930Sstevel@tonic-gate nextfreebin: 26942961Sdp78419 if (plw_initialized == 0) { 26952961Sdp78419 page_list_walk_init(szc, 0, bin, 1, 0, &plw); 26962961Sdp78419 ASSERT(plw.plw_ceq_dif == page_colors); 26972961Sdp78419 plw_initialized = 1; 26982961Sdp78419 } 26990Sstevel@tonic-gate 27002961Sdp78419 if (plw.plw_do_split) { 27012961Sdp78419 pp = page_freelist_split(szc, bin, mnode, 27022961Sdp78419 mtype, 27032961Sdp78419 mmu_btop(dma_attr->dma_attr_addr_hi + 1), 27042961Sdp78419 &plw); 27052961Sdp78419 if (pp != NULL) 27062961Sdp78419 return (pp); 27072961Sdp78419 } 27082961Sdp78419 27092961Sdp78419 bin = page_list_walk_next_bin(szc, bin, &plw); 27100Sstevel@tonic-gate } 27112961Sdp78419 2712414Skchow MTYPE_NEXT(mnode, mtype, flags); 2713414Skchow } while (mtype >= 0); 27140Sstevel@tonic-gate 27150Sstevel@tonic-gate /* failed to find a page in the freelist; try it in the cachelist */ 27160Sstevel@tonic-gate 27170Sstevel@tonic-gate /* reset mtype start for cachelist search */ 27180Sstevel@tonic-gate mtype = mtypestart; 27190Sstevel@tonic-gate ASSERT(mtype >= 0); 27200Sstevel@tonic-gate 27210Sstevel@tonic-gate /* start with the bin of matching color */ 27220Sstevel@tonic-gate bin = origbin; 27230Sstevel@tonic-gate 27240Sstevel@tonic-gate do { 27250Sstevel@tonic-gate for (i = 0; i <= page_colors; i++) { 27260Sstevel@tonic-gate if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL) 27270Sstevel@tonic-gate 
goto nextcachebin; 27280Sstevel@tonic-gate pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 27290Sstevel@tonic-gate mutex_enter(pcm); 27300Sstevel@tonic-gate pp = PAGE_CACHELISTS(mnode, bin, mtype); 27310Sstevel@tonic-gate first_pp = pp; 27320Sstevel@tonic-gate while (pp != NULL) { 27330Sstevel@tonic-gate if (page_trylock(pp, SE_EXCL) == 0) { 27340Sstevel@tonic-gate pp = pp->p_next; 27350Sstevel@tonic-gate if (pp == first_pp) 27360Sstevel@tonic-gate break; 27370Sstevel@tonic-gate continue; 27380Sstevel@tonic-gate } 27390Sstevel@tonic-gate ASSERT(pp->p_vnode); 27400Sstevel@tonic-gate ASSERT(PP_ISAGED(pp) == 0); 27410Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 27420Sstevel@tonic-gate ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 27430Sstevel@tonic-gate 27440Sstevel@tonic-gate /* check if page within DMA attributes */ 27450Sstevel@tonic-gate 27463446Smrj pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 27470Sstevel@tonic-gate if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 27480Sstevel@tonic-gate (pgaddr + MMU_PAGESIZE - 1 <= 27490Sstevel@tonic-gate dma_attr->dma_attr_addr_hi)) { 27500Sstevel@tonic-gate break; 27510Sstevel@tonic-gate } 27520Sstevel@tonic-gate 27530Sstevel@tonic-gate /* continue looking */ 27540Sstevel@tonic-gate page_unlock(pp); 27550Sstevel@tonic-gate pp = pp->p_next; 27560Sstevel@tonic-gate if (pp == first_pp) 27570Sstevel@tonic-gate pp = NULL; 27580Sstevel@tonic-gate } 27590Sstevel@tonic-gate 27600Sstevel@tonic-gate if (pp != NULL) { 27610Sstevel@tonic-gate ASSERT(mtype == PP_2_MTYPE(pp)); 27620Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 27630Sstevel@tonic-gate 27640Sstevel@tonic-gate /* found a page with specified DMA attributes */ 27650Sstevel@tonic-gate page_sub(&PAGE_CACHELISTS(mnode, bin, 27660Sstevel@tonic-gate mtype), pp); 2767414Skchow page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 27680Sstevel@tonic-gate 27690Sstevel@tonic-gate mutex_exit(pcm); 27700Sstevel@tonic-gate ASSERT(pp->p_vnode); 27710Sstevel@tonic-gate ASSERT(PP_ISAGED(pp) == 0); 27720Sstevel@tonic-gate check_dma(dma_attr, pp, 1); 27730Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocok); 27740Sstevel@tonic-gate return (pp); 27750Sstevel@tonic-gate } 27760Sstevel@tonic-gate mutex_exit(pcm); 27770Sstevel@tonic-gate nextcachebin: 27780Sstevel@tonic-gate bin += (i == 0) ? BIN_STEP : 1; 27790Sstevel@tonic-gate bin &= page_colors_mask; 27800Sstevel@tonic-gate } 2781414Skchow MTYPE_NEXT(mnode, mtype, flags); 2782414Skchow } while (mtype >= 0); 27830Sstevel@tonic-gate 27840Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocfailed); 27850Sstevel@tonic-gate return (NULL); 27860Sstevel@tonic-gate } 27870Sstevel@tonic-gate 27880Sstevel@tonic-gate /* 27890Sstevel@tonic-gate * This function is similar to page_get_freelist()/page_get_cachelist() 27900Sstevel@tonic-gate * but it searches both the lists to find a page with the specified 27910Sstevel@tonic-gate * color (or no color) and DMA attributes. The search is done in the 27920Sstevel@tonic-gate * freelist first and then in the cache list within the highest memory 27930Sstevel@tonic-gate * range (based on DMA attributes) before searching in the lower 27940Sstevel@tonic-gate * memory ranges. 27950Sstevel@tonic-gate * 27960Sstevel@tonic-gate * Note: This function is called only by page_create_io(). 
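 *
 * As a rough, purely hypothetical illustration: on a machine whose
 * memory type ranges are split at 16MB and 4GB, a request whose
 * dma_attr_addr_hi sits just under 4GB starts in the 16MB-4GB range
 * and falls back to the below-16MB range only when nothing suitable
 * is found higher up.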
27970Sstevel@tonic-gate */
27980Sstevel@tonic-gate /*ARGSUSED*/
2799*5084Sjohnlev static page_t *
28000Sstevel@tonic-gate page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
28010Sstevel@tonic-gate size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
28020Sstevel@tonic-gate {
28030Sstevel@tonic-gate uint_t bin;
28040Sstevel@tonic-gate int mtype;
28050Sstevel@tonic-gate page_t *pp;
28060Sstevel@tonic-gate int n;
28070Sstevel@tonic-gate int m;
28080Sstevel@tonic-gate int szc;
28090Sstevel@tonic-gate int fullrange;
28100Sstevel@tonic-gate int mnode;
28110Sstevel@tonic-gate int local_failed_stat = 0;
28120Sstevel@tonic-gate lgrp_mnode_cookie_t lgrp_cookie;
28130Sstevel@tonic-gate 
28140Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_alloc);
28150Sstevel@tonic-gate 
28160Sstevel@tonic-gate /* only base pagesize currently supported */
28170Sstevel@tonic-gate if (size != MMU_PAGESIZE)
28180Sstevel@tonic-gate return (NULL);
28190Sstevel@tonic-gate 
28200Sstevel@tonic-gate /*
28210Sstevel@tonic-gate * If we're passed a specific lgroup, we use it. Otherwise,
28220Sstevel@tonic-gate * assume first-touch placement is desired.
28230Sstevel@tonic-gate */
28240Sstevel@tonic-gate if (!LGRP_EXISTS(lgrp))
28250Sstevel@tonic-gate lgrp = lgrp_home_lgrp();
28260Sstevel@tonic-gate 
28270Sstevel@tonic-gate /* LINTED */
28282961Sdp78419 AS_2_BIN(as, seg, vp, vaddr, bin, 0);
28290Sstevel@tonic-gate 
28300Sstevel@tonic-gate /*
28310Sstevel@tonic-gate * Only hold one freelist or cachelist lock at a time; that way we
28320Sstevel@tonic-gate * can start anywhere and not have to worry about lock
28330Sstevel@tonic-gate * ordering.
28340Sstevel@tonic-gate */
28350Sstevel@tonic-gate if (dma_attr == NULL) {
28360Sstevel@tonic-gate n = 0;
28370Sstevel@tonic-gate m = mnoderangecnt - 1;
28380Sstevel@tonic-gate fullrange = 1;
28390Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
28400Sstevel@tonic-gate } else {
28410Sstevel@tonic-gate pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
28420Sstevel@tonic-gate pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
28430Sstevel@tonic-gate 
28440Sstevel@tonic-gate /*
28450Sstevel@tonic-gate * We can guarantee alignment only up to a page boundary.
28460Sstevel@tonic-gate */
28470Sstevel@tonic-gate if (dma_attr->dma_attr_align > MMU_PAGESIZE)
28480Sstevel@tonic-gate return (NULL);
28490Sstevel@tonic-gate 
28500Sstevel@tonic-gate n = pfn_2_mtype(pfnlo);
28510Sstevel@tonic-gate m = pfn_2_mtype(pfnhi);
28520Sstevel@tonic-gate 
28530Sstevel@tonic-gate fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
28540Sstevel@tonic-gate (pfnhi >= mnoderanges[m].mnr_pfnhi));
28550Sstevel@tonic-gate }
28560Sstevel@tonic-gate VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
28570Sstevel@tonic-gate 
28580Sstevel@tonic-gate if (n > m)
28590Sstevel@tonic-gate return (NULL);
28600Sstevel@tonic-gate 
28610Sstevel@tonic-gate szc = 0;
28620Sstevel@tonic-gate 
28630Sstevel@tonic-gate /* cycling thru mtype handled by RANGE0 if n == 0 */
28640Sstevel@tonic-gate if (n == 0) {
28650Sstevel@tonic-gate flags |= PGI_MT_RANGE0;
28660Sstevel@tonic-gate n = m;
28670Sstevel@tonic-gate }
28680Sstevel@tonic-gate 
28690Sstevel@tonic-gate /*
28700Sstevel@tonic-gate * Try local memory node first, but try remote if we can't
28710Sstevel@tonic-gate * get a page of the right color.
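 * (LGRP_SRCH_HIER asks lgrp_memnode_choose() to hand back the memory
 * nodes of this lgroup first and then to walk up the lgroup hierarchy,
 * so progressively more remote nodes are tried only after the closer
 * ones come up empty.)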
28720Sstevel@tonic-gate */
28730Sstevel@tonic-gate LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
28740Sstevel@tonic-gate while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
28750Sstevel@tonic-gate /*
28760Sstevel@tonic-gate * allocate pages from high pfn to low.
28770Sstevel@tonic-gate */
28780Sstevel@tonic-gate for (mtype = m; mtype >= n; mtype--) {
28790Sstevel@tonic-gate if (fullrange != 0) {
28800Sstevel@tonic-gate pp = page_get_mnode_freelist(mnode,
28810Sstevel@tonic-gate bin, mtype, szc, flags);
28820Sstevel@tonic-gate if (pp == NULL) {
28830Sstevel@tonic-gate pp = page_get_mnode_cachelist(
2884*5084Sjohnlev bin, flags, mnode, mtype);
28850Sstevel@tonic-gate }
28860Sstevel@tonic-gate } else {
28870Sstevel@tonic-gate pp = page_get_mnode_anylist(bin, szc,
28880Sstevel@tonic-gate flags, mnode, mtype, dma_attr);
28890Sstevel@tonic-gate }
28900Sstevel@tonic-gate if (pp != NULL) {
28910Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_allocok);
28920Sstevel@tonic-gate check_dma(dma_attr, pp, 1);
28930Sstevel@tonic-gate return (pp);
28940Sstevel@tonic-gate }
28950Sstevel@tonic-gate }
28960Sstevel@tonic-gate if (!local_failed_stat) {
28970Sstevel@tonic-gate lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
28980Sstevel@tonic-gate local_failed_stat = 1;
28990Sstevel@tonic-gate }
29000Sstevel@tonic-gate }
29010Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pga_allocfailed);
29020Sstevel@tonic-gate 
29030Sstevel@tonic-gate return (NULL);
29040Sstevel@tonic-gate }
29050Sstevel@tonic-gate 
29060Sstevel@tonic-gate /*
29070Sstevel@tonic-gate * page_create_io()
29080Sstevel@tonic-gate *
29090Sstevel@tonic-gate * This function is a copy of page_create_va() with an additional
29100Sstevel@tonic-gate * argument 'mattr' that specifies DMA memory requirements to
29110Sstevel@tonic-gate * the page list functions. This function is used by the segkmem
29120Sstevel@tonic-gate * allocator, so it is used only to create new pages (i.e., PG_EXCL
29130Sstevel@tonic-gate * is set).
29140Sstevel@tonic-gate *
29150Sstevel@tonic-gate * Note: This interface is currently used only by the x86 PSM and is
29160Sstevel@tonic-gate * not fully specified, so the commitment level is only that of a
29170Sstevel@tonic-gate * private interface specific to x86. This interface uses the
29180Sstevel@tonic-gate * PSM-specific page_get_anylist() interface.
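 *
 * Purely as an illustrative sketch -- the attribute values and variable
 * names below are made up rather than taken from any real caller -- a
 * PSM-style consumer that needs a single page below 16MB might do:
 *
 *	ddi_dma_attr_t attr;
 *
 *	attr.dma_attr_addr_lo = 0;
 *	attr.dma_attr_addr_hi = 0xffffffULL;	(16MB - 1)
 *	attr.dma_attr_align = MMU_PAGESIZE;
 *	pp = page_create_io(&kvp, off, MMU_PAGESIZE,
 *	    PG_EXCL | PG_WAIT, &kas, vaddr, &attr);
 *
 * On success the returned page(s) come back exclusively locked and
 * hashed in against the given vnode and offset.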
29190Sstevel@tonic-gate */ 29200Sstevel@tonic-gate 29210Sstevel@tonic-gate #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 29220Sstevel@tonic-gate for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 29230Sstevel@tonic-gate if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 29240Sstevel@tonic-gate break; \ 29250Sstevel@tonic-gate } \ 29260Sstevel@tonic-gate } 29270Sstevel@tonic-gate 29280Sstevel@tonic-gate 29290Sstevel@tonic-gate page_t * 29300Sstevel@tonic-gate page_create_io( 29310Sstevel@tonic-gate struct vnode *vp, 29320Sstevel@tonic-gate u_offset_t off, 29330Sstevel@tonic-gate uint_t bytes, 29340Sstevel@tonic-gate uint_t flags, 29350Sstevel@tonic-gate struct as *as, 29360Sstevel@tonic-gate caddr_t vaddr, 29370Sstevel@tonic-gate ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 29380Sstevel@tonic-gate { 29390Sstevel@tonic-gate page_t *plist = NULL; 29400Sstevel@tonic-gate uint_t plist_len = 0; 29410Sstevel@tonic-gate pgcnt_t npages; 29420Sstevel@tonic-gate page_t *npp = NULL; 29430Sstevel@tonic-gate uint_t pages_req; 29440Sstevel@tonic-gate page_t *pp; 29450Sstevel@tonic-gate kmutex_t *phm = NULL; 29460Sstevel@tonic-gate uint_t index; 29470Sstevel@tonic-gate 29480Sstevel@tonic-gate TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 2949*5084Sjohnlev "page_create_start:vp %p off %llx bytes %u flags %x", 2950*5084Sjohnlev vp, off, bytes, flags); 29510Sstevel@tonic-gate 29520Sstevel@tonic-gate ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 29530Sstevel@tonic-gate 29540Sstevel@tonic-gate pages_req = npages = mmu_btopr(bytes); 29550Sstevel@tonic-gate 29560Sstevel@tonic-gate /* 29570Sstevel@tonic-gate * Do the freemem and pcf accounting. 29580Sstevel@tonic-gate */ 29590Sstevel@tonic-gate if (!page_create_wait(npages, flags)) { 29600Sstevel@tonic-gate return (NULL); 29610Sstevel@tonic-gate } 29620Sstevel@tonic-gate 29630Sstevel@tonic-gate TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 2964*5084Sjohnlev "page_create_success:vp %p off %llx", vp, off); 29650Sstevel@tonic-gate 29660Sstevel@tonic-gate /* 29670Sstevel@tonic-gate * If satisfying this request has left us with too little 29680Sstevel@tonic-gate * memory, start the wheels turning to get some back. The 29690Sstevel@tonic-gate * first clause of the test prevents waking up the pageout 29700Sstevel@tonic-gate * daemon in situations where it would decide that there's 29710Sstevel@tonic-gate * nothing to do. 
29720Sstevel@tonic-gate */ 29730Sstevel@tonic-gate if (nscan < desscan && freemem < minfree) { 29740Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2975*5084Sjohnlev "pageout_cv_signal:freemem %ld", freemem); 29760Sstevel@tonic-gate cv_signal(&proc_pageout->p_cv); 29770Sstevel@tonic-gate } 29780Sstevel@tonic-gate 29790Sstevel@tonic-gate if (flags & PG_PHYSCONTIG) { 29800Sstevel@tonic-gate 29810Sstevel@tonic-gate plist = page_get_contigpage(&npages, mattr, 1); 29820Sstevel@tonic-gate if (plist == NULL) { 29830Sstevel@tonic-gate page_create_putback(npages); 29840Sstevel@tonic-gate return (NULL); 29850Sstevel@tonic-gate } 29860Sstevel@tonic-gate 29870Sstevel@tonic-gate pp = plist; 29880Sstevel@tonic-gate 29890Sstevel@tonic-gate do { 29900Sstevel@tonic-gate if (!page_hashin(pp, vp, off, NULL)) { 29910Sstevel@tonic-gate panic("pg_creat_io: hashin failed %p %p %llx", 29920Sstevel@tonic-gate (void *)pp, (void *)vp, off); 29930Sstevel@tonic-gate } 29940Sstevel@tonic-gate VM_STAT_ADD(page_create_new); 29950Sstevel@tonic-gate off += MMU_PAGESIZE; 29960Sstevel@tonic-gate PP_CLRFREE(pp); 29970Sstevel@tonic-gate PP_CLRAGED(pp); 29980Sstevel@tonic-gate page_set_props(pp, P_REF); 29990Sstevel@tonic-gate pp = pp->p_next; 30000Sstevel@tonic-gate } while (pp != plist); 30010Sstevel@tonic-gate 30020Sstevel@tonic-gate if (!npages) { 30030Sstevel@tonic-gate check_dma(mattr, plist, pages_req); 30040Sstevel@tonic-gate return (plist); 30050Sstevel@tonic-gate } else { 30060Sstevel@tonic-gate vaddr += (pages_req - npages) << MMU_PAGESHIFT; 30070Sstevel@tonic-gate } 30080Sstevel@tonic-gate 30090Sstevel@tonic-gate /* 30100Sstevel@tonic-gate * fall-thru: 30110Sstevel@tonic-gate * 30120Sstevel@tonic-gate * page_get_contigpage returns when npages <= sgllen. 30130Sstevel@tonic-gate * Grab the rest of the non-contig pages below from anylist. 30140Sstevel@tonic-gate */ 30150Sstevel@tonic-gate } 30160Sstevel@tonic-gate 30170Sstevel@tonic-gate /* 30180Sstevel@tonic-gate * Loop around collecting the requested number of pages. 30190Sstevel@tonic-gate * Most of the time, we have to `create' a new page. With 30200Sstevel@tonic-gate * this in mind, pull the page off the free list before 30210Sstevel@tonic-gate * getting the hash lock. This will minimize the hash 30220Sstevel@tonic-gate * lock hold time, nesting, and the like. If it turns 30230Sstevel@tonic-gate * out we don't need the page, we put it back at the end. 30240Sstevel@tonic-gate */ 30250Sstevel@tonic-gate while (npages--) { 30260Sstevel@tonic-gate phm = NULL; 30270Sstevel@tonic-gate 30280Sstevel@tonic-gate index = PAGE_HASH_FUNC(vp, off); 30290Sstevel@tonic-gate top: 30300Sstevel@tonic-gate ASSERT(phm == NULL); 30310Sstevel@tonic-gate ASSERT(index == PAGE_HASH_FUNC(vp, off)); 30320Sstevel@tonic-gate ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 30330Sstevel@tonic-gate 30340Sstevel@tonic-gate if (npp == NULL) { 30350Sstevel@tonic-gate /* 30360Sstevel@tonic-gate * Try to get the page of any color either from 30370Sstevel@tonic-gate * the freelist or from the cache list. 30380Sstevel@tonic-gate */ 30390Sstevel@tonic-gate npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, 30400Sstevel@tonic-gate flags & ~PG_MATCH_COLOR, mattr, NULL); 30410Sstevel@tonic-gate if (npp == NULL) { 30420Sstevel@tonic-gate if (mattr == NULL) { 30430Sstevel@tonic-gate /* 30440Sstevel@tonic-gate * Not looking for a special page; 30450Sstevel@tonic-gate * panic! 
30460Sstevel@tonic-gate */
30470Sstevel@tonic-gate panic("no page found %d", (int)npages);
30480Sstevel@tonic-gate }
30490Sstevel@tonic-gate /*
30500Sstevel@tonic-gate * No page found! This can happen
30510Sstevel@tonic-gate * if we are looking for a page
30520Sstevel@tonic-gate * within a specific memory range
30530Sstevel@tonic-gate * for DMA purposes. If PG_WAIT is
30540Sstevel@tonic-gate * specified then we wait for a
30550Sstevel@tonic-gate * while and then try again. The
30560Sstevel@tonic-gate * wait could be forever if we
30570Sstevel@tonic-gate * don't get the page(s) we need.
30580Sstevel@tonic-gate *
30590Sstevel@tonic-gate * Note: XXX We really need a mechanism
30600Sstevel@tonic-gate * to wait for pages in the desired
30610Sstevel@tonic-gate * range. For now, we wait for any
30620Sstevel@tonic-gate * pages and see if we can use them.
30630Sstevel@tonic-gate */
30640Sstevel@tonic-gate 
30650Sstevel@tonic-gate if ((mattr != NULL) && (flags & PG_WAIT)) {
30660Sstevel@tonic-gate delay(10);
30670Sstevel@tonic-gate goto top;
30680Sstevel@tonic-gate }
30690Sstevel@tonic-gate goto fail; /* undo accounting stuff */
30700Sstevel@tonic-gate }
30710Sstevel@tonic-gate 
30720Sstevel@tonic-gate if (PP_ISAGED(npp) == 0) {
30730Sstevel@tonic-gate /*
30740Sstevel@tonic-gate * Since this page came from the
30750Sstevel@tonic-gate * cachelist, we must destroy the
30760Sstevel@tonic-gate * old vnode association.
30770Sstevel@tonic-gate */
30780Sstevel@tonic-gate page_hashout(npp, (kmutex_t *)NULL);
30790Sstevel@tonic-gate }
30800Sstevel@tonic-gate }
30810Sstevel@tonic-gate 
30820Sstevel@tonic-gate /*
30830Sstevel@tonic-gate * We own this page!
30840Sstevel@tonic-gate */
30850Sstevel@tonic-gate ASSERT(PAGE_EXCL(npp));
30860Sstevel@tonic-gate ASSERT(npp->p_vnode == NULL);
30870Sstevel@tonic-gate ASSERT(!hat_page_is_mapped(npp));
30880Sstevel@tonic-gate PP_CLRFREE(npp);
30890Sstevel@tonic-gate PP_CLRAGED(npp);
30900Sstevel@tonic-gate 
30910Sstevel@tonic-gate /*
30920Sstevel@tonic-gate * Here we have a page in our hot little mitts and are
30930Sstevel@tonic-gate * just waiting to stuff it on the appropriate lists.
30940Sstevel@tonic-gate * Get the mutex and check to see if it really does
30950Sstevel@tonic-gate * not exist.
30960Sstevel@tonic-gate */
30970Sstevel@tonic-gate phm = PAGE_HASH_MUTEX(index);
30980Sstevel@tonic-gate mutex_enter(phm);
30990Sstevel@tonic-gate PAGE_HASH_SEARCH(index, pp, vp, off);
31000Sstevel@tonic-gate if (pp == NULL) {
31010Sstevel@tonic-gate VM_STAT_ADD(page_create_new);
31020Sstevel@tonic-gate pp = npp;
31030Sstevel@tonic-gate npp = NULL;
31040Sstevel@tonic-gate if (!page_hashin(pp, vp, off, phm)) {
31050Sstevel@tonic-gate /*
31060Sstevel@tonic-gate * Since we hold the page hash mutex and
31070Sstevel@tonic-gate * just searched for this page, page_hashin
31080Sstevel@tonic-gate * had better not fail. If it does, that
31090Sstevel@tonic-gate * means some thread did not follow the
31100Sstevel@tonic-gate * page hash mutex rules. Panic now and
31110Sstevel@tonic-gate * get it over with. As usual, go down
31120Sstevel@tonic-gate * holding all the locks.
31130Sstevel@tonic-gate */ 31140Sstevel@tonic-gate ASSERT(MUTEX_HELD(phm)); 31150Sstevel@tonic-gate panic("page_create: hashin fail %p %p %llx %p", 31160Sstevel@tonic-gate (void *)pp, (void *)vp, off, (void *)phm); 31170Sstevel@tonic-gate 31180Sstevel@tonic-gate } 31190Sstevel@tonic-gate ASSERT(MUTEX_HELD(phm)); 31200Sstevel@tonic-gate mutex_exit(phm); 31210Sstevel@tonic-gate phm = NULL; 31220Sstevel@tonic-gate 31230Sstevel@tonic-gate /* 31240Sstevel@tonic-gate * Hat layer locking need not be done to set 31250Sstevel@tonic-gate * the following bits since the page is not hashed 31260Sstevel@tonic-gate * and was on the free list (i.e., had no mappings). 31270Sstevel@tonic-gate * 31280Sstevel@tonic-gate * Set the reference bit to protect 31290Sstevel@tonic-gate * against immediate pageout 31300Sstevel@tonic-gate * 31310Sstevel@tonic-gate * XXXmh modify freelist code to set reference 31320Sstevel@tonic-gate * bit so we don't have to do it here. 31330Sstevel@tonic-gate */ 31340Sstevel@tonic-gate page_set_props(pp, P_REF); 31350Sstevel@tonic-gate } else { 31360Sstevel@tonic-gate ASSERT(MUTEX_HELD(phm)); 31370Sstevel@tonic-gate mutex_exit(phm); 31380Sstevel@tonic-gate phm = NULL; 31390Sstevel@tonic-gate /* 31400Sstevel@tonic-gate * NOTE: This should not happen for pages associated 31410Sstevel@tonic-gate * with kernel vnode 'kvp'. 31420Sstevel@tonic-gate */ 31430Sstevel@tonic-gate /* XX64 - to debug why this happens! */ 31443290Sjohansen ASSERT(!VN_ISKAS(vp)); 31453290Sjohansen if (VN_ISKAS(vp)) 31460Sstevel@tonic-gate cmn_err(CE_NOTE, 31470Sstevel@tonic-gate "page_create: page not expected " 31480Sstevel@tonic-gate "in hash list for kernel vnode - pp 0x%p", 31490Sstevel@tonic-gate (void *)pp); 31500Sstevel@tonic-gate VM_STAT_ADD(page_create_exists); 31510Sstevel@tonic-gate goto fail; 31520Sstevel@tonic-gate } 31530Sstevel@tonic-gate 31540Sstevel@tonic-gate /* 31550Sstevel@tonic-gate * Got a page! It is locked. Acquire the i/o 31560Sstevel@tonic-gate * lock since we are going to use the p_next and 31570Sstevel@tonic-gate * p_prev fields to link the requested pages together. 31580Sstevel@tonic-gate */ 31590Sstevel@tonic-gate page_io_lock(pp); 31600Sstevel@tonic-gate page_add(&plist, pp); 31610Sstevel@tonic-gate plist = plist->p_next; 31620Sstevel@tonic-gate off += MMU_PAGESIZE; 31630Sstevel@tonic-gate vaddr += MMU_PAGESIZE; 31640Sstevel@tonic-gate } 31650Sstevel@tonic-gate 31660Sstevel@tonic-gate check_dma(mattr, plist, pages_req); 31670Sstevel@tonic-gate return (plist); 31680Sstevel@tonic-gate 31690Sstevel@tonic-gate fail: 31700Sstevel@tonic-gate if (npp != NULL) { 31710Sstevel@tonic-gate /* 31720Sstevel@tonic-gate * Did not need this page after all. 31730Sstevel@tonic-gate * Put it back on the free list. 31740Sstevel@tonic-gate */ 31750Sstevel@tonic-gate VM_STAT_ADD(page_create_putbacks); 31760Sstevel@tonic-gate PP_SETFREE(npp); 31770Sstevel@tonic-gate PP_SETAGED(npp); 31780Sstevel@tonic-gate npp->p_offset = (u_offset_t)-1; 31790Sstevel@tonic-gate page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 31800Sstevel@tonic-gate page_unlock(npp); 31810Sstevel@tonic-gate } 31820Sstevel@tonic-gate 31830Sstevel@tonic-gate /* 31840Sstevel@tonic-gate * Give up the pages we already got. 
31850Sstevel@tonic-gate */ 31860Sstevel@tonic-gate while (plist != NULL) { 31870Sstevel@tonic-gate pp = plist; 31880Sstevel@tonic-gate page_sub(&plist, pp); 31890Sstevel@tonic-gate page_io_unlock(pp); 31900Sstevel@tonic-gate plist_len++; 31910Sstevel@tonic-gate /*LINTED: constant in conditional ctx*/ 31920Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 31930Sstevel@tonic-gate } 31940Sstevel@tonic-gate 31950Sstevel@tonic-gate /* 31960Sstevel@tonic-gate * VN_DISPOSE does freemem accounting for the pages in plist 31970Sstevel@tonic-gate * by calling page_free. So, we need to undo the pcf accounting 31980Sstevel@tonic-gate * for only the remaining pages. 31990Sstevel@tonic-gate */ 32000Sstevel@tonic-gate VM_STAT_ADD(page_create_putbacks); 32010Sstevel@tonic-gate page_create_putback(pages_req - plist_len); 32020Sstevel@tonic-gate 32030Sstevel@tonic-gate return (NULL); 32040Sstevel@tonic-gate } 3205*5084Sjohnlev #endif /* !__xpv */ 32060Sstevel@tonic-gate 32070Sstevel@tonic-gate 32080Sstevel@tonic-gate /* 32090Sstevel@tonic-gate * Copy the data from the physical page represented by "frompp" to 32100Sstevel@tonic-gate * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 32110Sstevel@tonic-gate * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt 32120Sstevel@tonic-gate * level and no one sleeps with an active mapping there. 32130Sstevel@tonic-gate * 32140Sstevel@tonic-gate * Note that the ref/mod bits in the page_t's are not affected by 32150Sstevel@tonic-gate * this operation, hence it is up to the caller to update them appropriately. 32160Sstevel@tonic-gate */ 32173253Smec int 32180Sstevel@tonic-gate ppcopy(page_t *frompp, page_t *topp) 32190Sstevel@tonic-gate { 32200Sstevel@tonic-gate caddr_t pp_addr1; 32210Sstevel@tonic-gate caddr_t pp_addr2; 32223446Smrj hat_mempte_t pte1; 32233446Smrj hat_mempte_t pte2; 32240Sstevel@tonic-gate kmutex_t *ppaddr_mutex; 32253253Smec label_t ljb; 32263253Smec int ret = 1; 32270Sstevel@tonic-gate 32280Sstevel@tonic-gate ASSERT_STACK_ALIGNED(); 32290Sstevel@tonic-gate ASSERT(PAGE_LOCKED(frompp)); 32300Sstevel@tonic-gate ASSERT(PAGE_LOCKED(topp)); 32310Sstevel@tonic-gate 32320Sstevel@tonic-gate if (kpm_enable) { 32330Sstevel@tonic-gate pp_addr1 = hat_kpm_page2va(frompp, 0); 32340Sstevel@tonic-gate pp_addr2 = hat_kpm_page2va(topp, 0); 32350Sstevel@tonic-gate kpreempt_disable(); 32360Sstevel@tonic-gate } else { 32370Sstevel@tonic-gate /* 32380Sstevel@tonic-gate * disable pre-emption so that CPU can't change 32390Sstevel@tonic-gate */ 32400Sstevel@tonic-gate kpreempt_disable(); 32410Sstevel@tonic-gate 32420Sstevel@tonic-gate pp_addr1 = CPU->cpu_caddr1; 32430Sstevel@tonic-gate pp_addr2 = CPU->cpu_caddr2; 32443446Smrj pte1 = CPU->cpu_caddr1pte; 32453446Smrj pte2 = CPU->cpu_caddr2pte; 32460Sstevel@tonic-gate 32470Sstevel@tonic-gate ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 32480Sstevel@tonic-gate mutex_enter(ppaddr_mutex); 32490Sstevel@tonic-gate 32500Sstevel@tonic-gate hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1, 32510Sstevel@tonic-gate PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST); 32520Sstevel@tonic-gate hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2, 32530Sstevel@tonic-gate PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 32540Sstevel@tonic-gate HAT_LOAD_NOCONSIST); 32550Sstevel@tonic-gate } 32560Sstevel@tonic-gate 32573253Smec if (on_fault(&ljb)) { 32583253Smec ret = 0; 32593253Smec goto faulted; 32603253Smec } 32610Sstevel@tonic-gate if (use_sse_pagecopy) 3262*5084Sjohnlev #ifdef __xpv 3263*5084Sjohnlev 
page_copy_no_xmm(pp_addr2, pp_addr1); 3264*5084Sjohnlev #else 32650Sstevel@tonic-gate hwblkpagecopy(pp_addr1, pp_addr2); 3266*5084Sjohnlev #endif 32670Sstevel@tonic-gate else 32680Sstevel@tonic-gate bcopy(pp_addr1, pp_addr2, PAGESIZE); 32690Sstevel@tonic-gate 32703253Smec no_fault(); 32713253Smec faulted: 32723446Smrj if (!kpm_enable) { 3273*5084Sjohnlev #ifdef __xpv 3274*5084Sjohnlev /* 3275*5084Sjohnlev * The target page might get used for a page table before any 3276*5084Sjohnlev * intervening change to the non-kpm mapping, so blow it away. 3277*5084Sjohnlev */ 3278*5084Sjohnlev if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 3279*5084Sjohnlev UVMF_INVLPG | UVMF_LOCAL) < 0) 3280*5084Sjohnlev panic("HYPERVISOR_update_va_mapping() failed"); 3281*5084Sjohnlev #endif 32820Sstevel@tonic-gate mutex_exit(ppaddr_mutex); 32833446Smrj } 32840Sstevel@tonic-gate kpreempt_enable(); 32853253Smec return (ret); 32860Sstevel@tonic-gate } 32870Sstevel@tonic-gate 32880Sstevel@tonic-gate /* 32890Sstevel@tonic-gate * Zero the physical page from off to off + len given by `pp' 32900Sstevel@tonic-gate * without changing the reference and modified bits of page. 32910Sstevel@tonic-gate * 32920Sstevel@tonic-gate * We use this using CPU private page address #2, see ppcopy() for more info. 32930Sstevel@tonic-gate * pagezero() must not be called at interrupt level. 32940Sstevel@tonic-gate */ 32950Sstevel@tonic-gate void 32960Sstevel@tonic-gate pagezero(page_t *pp, uint_t off, uint_t len) 32970Sstevel@tonic-gate { 32980Sstevel@tonic-gate caddr_t pp_addr2; 32993446Smrj hat_mempte_t pte2; 33000Sstevel@tonic-gate kmutex_t *ppaddr_mutex; 33010Sstevel@tonic-gate 33020Sstevel@tonic-gate ASSERT_STACK_ALIGNED(); 33030Sstevel@tonic-gate ASSERT(len <= MMU_PAGESIZE); 33040Sstevel@tonic-gate ASSERT(off <= MMU_PAGESIZE); 33050Sstevel@tonic-gate ASSERT(off + len <= MMU_PAGESIZE); 33060Sstevel@tonic-gate ASSERT(PAGE_LOCKED(pp)); 33070Sstevel@tonic-gate 33080Sstevel@tonic-gate if (kpm_enable) { 33090Sstevel@tonic-gate pp_addr2 = hat_kpm_page2va(pp, 0); 33100Sstevel@tonic-gate kpreempt_disable(); 33110Sstevel@tonic-gate } else { 33120Sstevel@tonic-gate kpreempt_disable(); 33130Sstevel@tonic-gate 33140Sstevel@tonic-gate pp_addr2 = CPU->cpu_caddr2; 33153446Smrj pte2 = CPU->cpu_caddr2pte; 33160Sstevel@tonic-gate 33170Sstevel@tonic-gate ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 33180Sstevel@tonic-gate mutex_enter(ppaddr_mutex); 33190Sstevel@tonic-gate 33200Sstevel@tonic-gate hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2, 33210Sstevel@tonic-gate PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 33220Sstevel@tonic-gate HAT_LOAD_NOCONSIST); 33230Sstevel@tonic-gate } 33240Sstevel@tonic-gate 33253446Smrj if (use_sse_pagezero) { 3326*5084Sjohnlev #ifdef __xpv 3327*5084Sjohnlev uint_t rem; 3328*5084Sjohnlev 3329*5084Sjohnlev /* 3330*5084Sjohnlev * zero a byte at a time until properly aligned for 3331*5084Sjohnlev * block_zero_no_xmm(). 3332*5084Sjohnlev */ 3333*5084Sjohnlev while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0) 3334*5084Sjohnlev pp_addr2[off++] = 0; 3335*5084Sjohnlev 3336*5084Sjohnlev /* 3337*5084Sjohnlev * Now use faster block_zero_no_xmm() for any range 3338*5084Sjohnlev * that is properly aligned and sized. 
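 * (P2PHASE(len, BLOCKZEROALIGN) is simply len modulo BLOCKZEROALIGN,
 * i.e. the odd tail bytes that get finished off with plain byte
 * stores after the bulk of the range has gone through
 * block_zero_no_xmm().)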
3339*5084Sjohnlev */ 3340*5084Sjohnlev rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN)); 3341*5084Sjohnlev len -= rem; 3342*5084Sjohnlev if (len != 0) { 3343*5084Sjohnlev block_zero_no_xmm(pp_addr2 + off, len); 3344*5084Sjohnlev off += len; 3345*5084Sjohnlev } 3346*5084Sjohnlev 3347*5084Sjohnlev /* 3348*5084Sjohnlev * zero remainder with byte stores. 3349*5084Sjohnlev */ 3350*5084Sjohnlev while (rem-- > 0) 3351*5084Sjohnlev pp_addr2[off++] = 0; 3352*5084Sjohnlev #else 33530Sstevel@tonic-gate hwblkclr(pp_addr2 + off, len); 3354*5084Sjohnlev #endif 33553446Smrj } else { 33560Sstevel@tonic-gate bzero(pp_addr2 + off, len); 33573446Smrj } 33580Sstevel@tonic-gate 3359*5084Sjohnlev #ifdef __xpv 3360*5084Sjohnlev /* 3361*5084Sjohnlev * On the hypervisor this page might get used for a page table before 3362*5084Sjohnlev * any intervening change to this mapping, so blow it away. 3363*5084Sjohnlev */ 3364*5084Sjohnlev if (!kpm_enable && HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 3365*5084Sjohnlev UVMF_INVLPG) < 0) 3366*5084Sjohnlev panic("HYPERVISOR_update_va_mapping() failed"); 3367*5084Sjohnlev #endif 3368*5084Sjohnlev 33690Sstevel@tonic-gate if (!kpm_enable) 33700Sstevel@tonic-gate mutex_exit(ppaddr_mutex); 33710Sstevel@tonic-gate kpreempt_enable(); 33720Sstevel@tonic-gate } 33730Sstevel@tonic-gate 33740Sstevel@tonic-gate /* 33750Sstevel@tonic-gate * Platform-dependent page scrub call. 33760Sstevel@tonic-gate */ 33770Sstevel@tonic-gate void 33780Sstevel@tonic-gate pagescrub(page_t *pp, uint_t off, uint_t len) 33790Sstevel@tonic-gate { 33800Sstevel@tonic-gate /* 33810Sstevel@tonic-gate * For now, we rely on the fact that pagezero() will 33820Sstevel@tonic-gate * always clear UEs. 33830Sstevel@tonic-gate */ 33840Sstevel@tonic-gate pagezero(pp, off, len); 33850Sstevel@tonic-gate } 33860Sstevel@tonic-gate 33870Sstevel@tonic-gate /* 33880Sstevel@tonic-gate * set up two private addresses for use on a given CPU for use in ppcopy() 33890Sstevel@tonic-gate */ 33900Sstevel@tonic-gate void 33910Sstevel@tonic-gate setup_vaddr_for_ppcopy(struct cpu *cpup) 33920Sstevel@tonic-gate { 33930Sstevel@tonic-gate void *addr; 33943446Smrj hat_mempte_t pte_pa; 33950Sstevel@tonic-gate 33960Sstevel@tonic-gate addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 33973446Smrj pte_pa = hat_mempte_setup(addr); 33980Sstevel@tonic-gate cpup->cpu_caddr1 = addr; 33993446Smrj cpup->cpu_caddr1pte = pte_pa; 34000Sstevel@tonic-gate 34010Sstevel@tonic-gate addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 34023446Smrj pte_pa = hat_mempte_setup(addr); 34030Sstevel@tonic-gate cpup->cpu_caddr2 = addr; 34043446Smrj cpup->cpu_caddr2pte = pte_pa; 34050Sstevel@tonic-gate 34060Sstevel@tonic-gate mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 34070Sstevel@tonic-gate } 34080Sstevel@tonic-gate 34093446Smrj /* 34103446Smrj * Undo setup_vaddr_for_ppcopy 34113446Smrj */ 34123446Smrj void 34133446Smrj teardown_vaddr_for_ppcopy(struct cpu *cpup) 34143446Smrj { 34153446Smrj mutex_destroy(&cpup->cpu_ppaddr_mutex); 34163446Smrj 34173446Smrj hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte); 34183446Smrj cpup->cpu_caddr2pte = 0; 34193446Smrj vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1)); 34203446Smrj cpup->cpu_caddr2 = 0; 34213446Smrj 34223446Smrj hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte); 34233446Smrj cpup->cpu_caddr1pte = 0; 34243446Smrj vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1)); 34253446Smrj cpup->cpu_caddr1 = 0; 34263446Smrj } 34270Sstevel@tonic-gate 34280Sstevel@tonic-gate /* 
34290Sstevel@tonic-gate * Create the pageout scanner thread. The thread has to
34300Sstevel@tonic-gate * start executing `procedure' within process `pp' at priority `pri'.
34310Sstevel@tonic-gate */
34320Sstevel@tonic-gate void
34330Sstevel@tonic-gate pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
34340Sstevel@tonic-gate {
34350Sstevel@tonic-gate (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
34360Sstevel@tonic-gate }
34370Sstevel@tonic-gate 
34380Sstevel@tonic-gate /*
34390Sstevel@tonic-gate * Function for flushing D-cache when performing module relocations
34400Sstevel@tonic-gate * to an alternate mapping. Unnecessary on Intel / AMD platforms.
34410Sstevel@tonic-gate */
34420Sstevel@tonic-gate void
34430Sstevel@tonic-gate dcache_flushall()
34440Sstevel@tonic-gate {}
34453177Sdp78419 
34463177Sdp78419 size_t
34473177Sdp78419 exec_get_spslew(void)
34483177Sdp78419 {
34493177Sdp78419 return (0);
34503177Sdp78419 }
34513446Smrj 
34523446Smrj /*
34533446Smrj * Allocate a memory page. The argument 'seed' can be any pseudo-random
34543446Smrj * number to vary where the pages come from. This is quite a hacked up
34553446Smrj * method -- it works for now, but really needs to be fixed up a bit.
34563446Smrj *
34573446Smrj * We currently use page_create_va() on the kvp with fake offsets,
34583446Smrj * segments and virt address. This is pretty bogus, but was copied from the
34593446Smrj * old hat_i86.c code. A better approach would be to specify either mnode
34603446Smrj * random or mnode local and take a page from whatever color has the MOST
34613446Smrj * available - this would have a minimal impact on page coloring.
34623446Smrj */
34633446Smrj page_t *
34643446Smrj page_get_physical(uintptr_t seed)
34653446Smrj {
34663446Smrj page_t *pp;
34673446Smrj u_offset_t offset;
34683446Smrj static struct seg tmpseg;
34693446Smrj static uintptr_t ctr = 0;
34703446Smrj 
34713446Smrj /*
34723446Smrj * This code is gross; we really need a simpler page allocator.
34733446Smrj *
34743446Smrj * We need to assign an offset for the page to call page_create_va().
34753446Smrj * To avoid conflicts with other pages, we get creative with the offset.
34763446Smrj * For 32 bits, we pick an offset > 4Gig.
34773446Smrj * For 64 bits, we pick an offset somewhere in the VA hole.
34783446Smrj */
34793446Smrj offset = seed;
34803446Smrj if (offset > kernelbase)
34813446Smrj offset -= kernelbase;
34823446Smrj offset <<= MMU_PAGESHIFT;
34833446Smrj #if defined(__amd64)
34843446Smrj offset += mmu.hole_start; /* something in VA hole */
34853446Smrj #else
34863446Smrj offset += 1ULL << 40; /* something > 4 Gig */
34873446Smrj #endif
34883446Smrj 
34893446Smrj if (page_resv(1, KM_NOSLEEP) == 0)
34903446Smrj return (NULL);
34913446Smrj 
34923446Smrj #ifdef DEBUG
34933446Smrj pp = page_exists(&kvp, offset);
34943446Smrj if (pp != NULL)
34953446Smrj panic("page already exists %p", pp);
34963446Smrj #endif
34973446Smrj 
3498*5084Sjohnlev pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
34993446Smrj &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE)); /* changing VA usage */
35003446Smrj if (pp == NULL)
35013446Smrj return (NULL);
35023446Smrj page_io_unlock(pp);
35033446Smrj page_hashout(pp, NULL);
35043446Smrj return (pp);
35053446Smrj }
3506
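/*
 * Illustrative sketch only (not from a real caller; `some_seed' is a
 * made-up name): page_get_physical() hands back an exclusively locked,
 * unhashed page, and a consumer typically only wants its pfn:
 *
 *	page_t *pp;
 *
 *	if ((pp = page_get_physical((uintptr_t)some_seed)) == NULL)
 *		return (PFN_INVALID);
 *	return (pp->p_pagenum);
 *
 * Note that the page, and the page_resv() reservation taken above,
 * belong to the caller once this returns.
 */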