1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 26 /* All Rights Reserved */ 27 28 /* 29 * Portions of this source code were derived from Berkeley 4.3 BSD 30 * under license from the Regents of the University of California. 31 */ 32 33 34 /* 35 * This file contains common functions to access and manage the page lists. 36 * Many of these routines originated from platform dependent modules 37 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in 38 * a platform independent manner. 39 * 40 * vm/vm_dep.h provides for platform specific support. 41 */ 42 43 #include <sys/types.h> 44 #include <sys/debug.h> 45 #include <sys/cmn_err.h> 46 #include <sys/systm.h> 47 #include <sys/atomic.h> 48 #include <sys/sysmacros.h> 49 #include <vm/as.h> 50 #include <vm/page.h> 51 #include <vm/seg_kmem.h> 52 #include <vm/seg_vn.h> 53 #include <sys/vmsystm.h> 54 #include <sys/memnode.h> 55 #include <vm/vm_dep.h> 56 #include <sys/lgrp.h> 57 #include <sys/mem_config.h> 58 #include <sys/callb.h> 59 #include <sys/mem_cage.h> 60 #include <sys/kflt_mem.h> 61 #include <sys/sdt.h> 62 #include <sys/dumphdr.h> 63 #include <sys/swap.h> 64 65 extern uint_t vac_colors; 66 67 #define MAX_PRAGMA_ALIGN 128 68 69 /* vm_cpu_data0 for the boot cpu before kmem is initialized */ 70 71 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN 72 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0) 73 #else 74 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0) 75 #endif 76 char vm_cpu_data0[VM_CPU_DATA_PADSIZE]; 77 78 /* 79 * number of page colors equivalent to reqested color in page_get routines. 80 * If set, keeps large pages intact longer and keeps MPO allocation 81 * from the local mnode in favor of acquiring the 'correct' page color from 82 * a demoted large page or from a remote mnode. 83 */ 84 uint_t colorequiv; 85 86 /* 87 * color equivalency mask for each page size. 88 * Mask is computed based on cpu L2$ way sizes and colorequiv global. 89 * High 4 bits determine the number of high order bits of the color to ignore. 90 * Low 4 bits determines number of low order bits of color to ignore (it's only 91 * relevant for hashed index based page coloring). 92 */ 93 uchar_t colorequivszc[MMU_PAGE_SIZES]; 94 95 /* 96 * if set, specifies the percentage of large pages that are free from within 97 * a large page region before attempting to lock those pages for 98 * page_get_contig_pages processing. 99 * 100 * Should be turned on when kpr is available when page_trylock_contig_pages 101 * can be more selective. 
102 */ 103 104 int ptcpthreshold; 105 106 /* 107 * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 108 * Enabled by default via pgcplimitsearch. 109 * 110 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed 111 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper 112 * bound. This upper bound range guarantees: 113 * - all large page 'slots' will be searched over time 114 * - the minimum (1) large page candidates considered on each pgcp call 115 * - count doesn't wrap around to 0 116 */ 117 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES]; 118 int pgcplimitsearch = 1; 119 120 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1)) 121 #define SETPGCPFAILCNT(szc) \ 122 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \ 123 pgcpfailcnt[szc] = PGCPFAILMAX / 2; 124 125 /* 126 * There are two page freelist types that are supported, flt_user, the user 127 * page freelist type and flt_kern, the kernel page freelist type. 128 */ 129 130 page_freelist_type_t flt_user; 131 page_freelist_type_t flt_kern; 132 page_freelist_type_t *ufltp = &flt_user; 133 page_freelist_type_t *kfltp = &flt_kern; 134 135 #ifdef VM_STATS 136 struct vmm_vmstats_str vmm_vmstats; 137 #endif /* VM_STATS */ 138 139 #if defined(__sparc) 140 #define LPGCREATE 0 141 #else 142 /* enable page_get_contig_pages */ 143 #define LPGCREATE 1 144 #endif 145 146 int pg_contig_disable; 147 int pg_lpgcreate_nocage = LPGCREATE; 148 149 /* 150 * page_freelist_split pfn flag to signify no lo or hi pfn requirement. 151 */ 152 #define PFNNULL 0 153 154 /* Flags involved in promotion and demotion routines */ 155 #define PC_FREE 0x1 /* put page on freelist */ 156 #define PC_ALLOC 0x2 /* return page for allocation */ 157 158 /* 159 * Flag for page_demote to be used with PC_FREE to denote that we don't care 160 * what the color is as the color parameter to the function is ignored. 161 */ 162 #define PC_NO_COLOR (-1) 163 164 /* mtype value for page_promote to use when mtype does not matter */ 165 #define PC_MTYPE_ANY (-1) 166 167 /* 168 * page counters candidates info 169 * See page_ctrs_cands comment below for more details. 170 * fields are as follows: 171 * pcc_pages_free: # pages which freelist coalesce can create 172 * pcc_color_free: pointer to page free counts per color 173 */ 174 typedef struct pcc_info { 175 pgcnt_t pcc_pages_free; 176 pgcnt_t *pcc_color_free; 177 uint_t pad[12]; 178 } pcc_info_t; 179 180 /* 181 * On big machines it can take a long time to check page_counters 182 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically 183 * updated sum of all elements of the corresponding page_counters arrays. 184 * page_freelist_coalesce() searches page_counters only if an appropriate 185 * element of page_ctrs_cands array is greater than 0. 
186 * 187 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g) 188 */ 189 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES]; 190 191 /* 192 * Return in val the total number of free pages which can be created 193 * for the given mnode (m), mrange (g), and region size (r) 194 */ 195 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \ 196 int i; \ 197 val = 0; \ 198 for (i = 0; i < NPC_MUTEX; i++) { \ 199 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \ 200 } \ 201 } 202 203 /* 204 * Return in val the total number of free pages which can be created 205 * for the given mnode (m), mrange (g), region size (r), and color (c) 206 */ 207 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \ 208 int i; \ 209 val = 0; \ 210 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \ 211 for (i = 0; i < NPC_MUTEX; i++) { \ 212 val += \ 213 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \ 214 } \ 215 } 216 217 /* 218 * We can only allow a single thread to update a counter within the physical 219 * range of the largest supported page size. That is the finest granularity 220 * possible since the counter values are dependent on each other 221 * as you move accross region sizes. PP_CTR_LOCK_INDX is used to determine the 222 * ctr_mutex lock index for a particular physical range. 223 */ 224 static kmutex_t *ctr_mutex[NPC_MUTEX]; 225 226 #define PP_CTR_LOCK_INDX(pp) \ 227 (((pp)->p_pagenum >> \ 228 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) 229 230 #define INVALID_COLOR 0xffffffff 231 #define INVALID_MASK 0xffffffff 232 233 /* 234 * Local functions prototypes. 235 */ 236 237 void page_ctr_add(int, int, page_t *, int); 238 void page_ctr_add_internal(int, int, page_t *, int); 239 void page_ctr_sub(int, int, page_t *, int); 240 void page_ctr_sub_internal(int, int, page_t *, int); 241 void page_freelist_lock(int); 242 void page_freelist_unlock(int); 243 page_t *page_promote(int, pfn_t, uchar_t, int, int); 244 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int); 245 page_t *page_freelist_split(uchar_t, 246 uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *); 247 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 248 static page_t *page_get_flist(page_freelist_type_t *, uint_t, int, 249 uchar_t, uint_t, struct lgrp *); 250 251 static int page_trylock_cons(page_t *pp, se_t se); 252 253 /* 254 * The page_counters array below is used to keep track of free contiguous 255 * physical memory. A hw_page_map_t will be allocated per mnode per szc. 256 * This contains an array of counters, the size of the array, a shift value 257 * used to convert a pagenum into a counter array index or vice versa, as 258 * well as a cache of the last successful index to be promoted to a larger 259 * page size. As an optimization, we keep track of the last successful index 260 * to be promoted per page color for the given size region, and this is 261 * allocated dynamically based upon the number of colors for a given 262 * region size. 263 * 264 * Conceptually, the page counters are represented as: 265 * 266 * page_counters[region_size][mnode] 267 * 268 * region_size: size code of a candidate larger page made up 269 * of contiguous free smaller pages. 270 * 271 * page_counters[region_size][mnode].hpm_counters[index]: 272 * represents how many (region_size - 1) pages either 273 * exist or can be created within the given index range. 274 * 275 * Let's look at a sparc example: 276 * If we want to create a free 512k page, we look at region_size 2 277 * for the mnode we want. 
We calculate the index and look at a specific 278 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at 279 * this location, it means that 8 64k pages either exist or can be created 280 * from 8K pages in order to make a single free 512k page at the given 281 * index. Note that when a region is full, it will contribute to the 282 * counts in the region above it. Thus we will not know what page 283 * size the free pages will be which can be promoted to this new free 284 * page unless we look at all regions below the current region. 285 */ 286 287 /* 288 * Note: hpmctr_t is defined in platform vm_dep.h 289 * hw_page_map_t contains all the information needed for the page_counters 290 * logic. The fields are as follows: 291 * 292 * hpm_counters: dynamically allocated array to hold counter data 293 * hpm_entries: entries in hpm_counters 294 * hpm_shift: shift for pnum/array index conv 295 * hpm_base: PFN mapped to counter index 0 296 * hpm_color_current: last index in counter array for this color at 297 * which we successfully created a large page 298 */ 299 typedef struct hw_page_map { 300 hpmctr_t *hpm_counters; 301 size_t hpm_entries; 302 int hpm_shift; 303 pfn_t hpm_base; 304 size_t *hpm_color_current[MAX_MNODE_MRANGES]; 305 #if defined(__sparc) 306 uint_t pad[4]; 307 #endif 308 } hw_page_map_t; 309 310 /* 311 * Element zero is not used, but is allocated for convenience. 312 */ 313 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 314 315 /* 316 * Cached value of MNODE_RANGE_CNT(mnode). 317 * This is a function call in x86. 318 */ 319 static int mnode_nranges[MAX_MEM_NODES]; 320 static int mnode_maxmrange[MAX_MEM_NODES]; 321 322 /* 323 * The following macros are convenient ways to get access to the individual 324 * elements of the page_counters arrays. They can be used on both 325 * the left side and right side of equations. 326 */ 327 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 328 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 329 330 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 331 (page_counters[(rg_szc)][(mnode)].hpm_counters) 332 333 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 334 (page_counters[(rg_szc)][(mnode)].hpm_shift) 335 336 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 337 (page_counters[(rg_szc)][(mnode)].hpm_entries) 338 339 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 340 (page_counters[(rg_szc)][(mnode)].hpm_base) 341 342 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \ 343 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)]) 344 345 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \ 346 (page_counters[(rg_szc)][(mnode)]. \ 347 hpm_color_current[(mrange)][(color)]) 348 349 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 350 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 351 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 352 353 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 354 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 355 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 356 357 /* 358 * Protects the hpm_counters and hpm_color_current memory from changing while 359 * looking at page counters information. 360 * Grab the write lock to modify what these fields point at. 361 * Grab the read lock to prevent any pointers from changing. 362 * The write lock can not be held during memory allocation due to a possible 363 * recursion deadlock with trying to grab the read lock while the 364 * write lock is already held. 
365 */ 366 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 367 368 /* 369 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 370 */ 371 void 372 cpu_vm_data_init(struct cpu *cp) 373 { 374 if (cp == CPU0) { 375 cp->cpu_vm_data = (void *)&vm_cpu_data0; 376 } else { 377 void *kmptr; 378 int align; 379 size_t sz; 380 381 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX; 382 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align; 383 kmptr = kmem_zalloc(sz, KM_SLEEP); 384 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align); 385 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr; 386 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz; 387 } 388 } 389 390 /* 391 * free cpu_vm_data 392 */ 393 void 394 cpu_vm_data_destroy(struct cpu *cp) 395 { 396 if (cp->cpu_seqid && cp->cpu_vm_data) { 397 ASSERT(cp != CPU0); 398 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr, 399 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize); 400 } 401 cp->cpu_vm_data = NULL; 402 } 403 404 405 /* 406 * page size to page size code 407 */ 408 int 409 page_szc(size_t pagesize) 410 { 411 int i = 0; 412 413 while (hw_page_array[i].hp_size) { 414 if (pagesize == hw_page_array[i].hp_size) 415 return (i); 416 i++; 417 } 418 return (-1); 419 } 420 421 /* 422 * page size to page size code with the restriction that it be a supported 423 * user page size. If it's not a supported user page size, -1 will be returned. 424 */ 425 int 426 page_szc_user_filtered(size_t pagesize) 427 { 428 int szc = page_szc(pagesize); 429 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 430 return (szc); 431 } 432 return (-1); 433 } 434 435 /* 436 * Return how many page sizes are available for the user to use. This is 437 * what the hardware supports and not based upon how the OS implements the 438 * support of different page sizes. 439 * 440 * If legacy is non-zero, return the number of pagesizes available to legacy 441 * applications. The number of legacy page sizes might be less than the 442 * exported user page sizes. This is to prevent legacy applications that 443 * use the largest page size returned from getpagesizes(3c) from inadvertantly 444 * using the 'new' large pagesizes. 445 */ 446 uint_t 447 page_num_user_pagesizes(int legacy) 448 { 449 if (legacy) 450 return (mmu_legacy_page_sizes); 451 return (mmu_exported_page_sizes); 452 } 453 454 uint_t 455 page_num_pagesizes(void) 456 { 457 return (mmu_page_sizes); 458 } 459 460 /* 461 * returns the count of the number of base pagesize pages associated with szc 462 */ 463 pgcnt_t 464 page_get_pagecnt(uint_t szc) 465 { 466 if (szc >= mmu_page_sizes) 467 panic("page_get_pagecnt: out of range %d", szc); 468 return (hw_page_array[szc].hp_pgcnt); 469 } 470 471 size_t 472 page_get_pagesize(uint_t szc) 473 { 474 if (szc >= mmu_page_sizes) 475 panic("page_get_pagesize: out of range %d", szc); 476 return (hw_page_array[szc].hp_size); 477 } 478 479 /* 480 * Return the size of a page based upon the index passed in. An index of 481 * zero refers to the smallest page size in the system, and as index increases 482 * it refers to the next larger supported page size in the system. 483 * Note that szc and userszc may not be the same due to unsupported szc's on 484 * some systems. 
485 */ 486 size_t 487 page_get_user_pagesize(uint_t userszc) 488 { 489 uint_t szc = USERSZC_2_SZC(userszc); 490 491 if (szc >= mmu_page_sizes) 492 panic("page_get_user_pagesize: out of range %d", szc); 493 return (hw_page_array[szc].hp_size); 494 } 495 496 uint_t 497 page_get_shift(uint_t szc) 498 { 499 if (szc >= mmu_page_sizes) 500 panic("page_get_shift: out of range %d", szc); 501 return (PAGE_GET_SHIFT(szc)); 502 } 503 504 uint_t 505 page_get_pagecolors(uint_t szc) 506 { 507 if (szc >= mmu_page_sizes) 508 panic("page_get_pagecolors: out of range %d", szc); 509 return (PAGE_GET_PAGECOLORS(szc)); 510 } 511 512 /* 513 * this assigns the desired equivalent color after a split 514 */ 515 uint_t 516 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color, 517 uint_t ncolor, uint_t ceq_mask) 518 { 519 ASSERT(nszc > szc); 520 ASSERT(szc < mmu_page_sizes); 521 ASSERT(color < PAGE_GET_PAGECOLORS(szc)); 522 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc)); 523 524 color &= ceq_mask; 525 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc); 526 return (color | (ncolor & ~ceq_mask)); 527 } 528 529 /* 530 * The interleaved_mnodes flag is set when mnodes overlap in 531 * the physbase..physmax range, but have disjoint slices. 532 * In this case hpm_counters is shared by all mnodes. 533 * This flag is set dynamically by the platform. 534 */ 535 int interleaved_mnodes = 0; 536 537 /* 538 * Called by startup(). 539 * Size up the per page size free list counters based on physmax 540 * of each node and max_mem_nodes. 541 * 542 * If interleaved_mnodes is set we need to find the first mnode that 543 * exists. hpm_counters for the first mnode will then be shared by 544 * all other mnodes. If interleaved_mnodes is not set, just set 545 * first=mnode each time. That means there will be no sharing. 546 */ 547 size_t 548 page_ctrs_sz(void) 549 { 550 int r; /* region size */ 551 int mnode; 552 int firstmn; /* first mnode that exists */ 553 int nranges; 554 pfn_t physbase; 555 pfn_t physmax; 556 uint_t ctrs_sz = 0; 557 int i; 558 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 559 560 /* 561 * We need to determine how many page colors there are for each 562 * page size in order to allocate memory for any color specific 563 * arrays. 564 */ 565 for (i = 0; i < mmu_page_sizes; i++) { 566 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 567 } 568 569 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 570 571 pgcnt_t r_pgcnt; 572 pfn_t r_base; 573 pgcnt_t r_align; 574 575 if (mem_node_config[mnode].exists == 0) 576 continue; 577 578 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 579 nranges = MNODE_RANGE_CNT(mnode); 580 mnode_nranges[mnode] = nranges; 581 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 582 583 /* 584 * determine size needed for page counter arrays with 585 * base aligned to large page size. 586 */ 587 for (r = 1; r < mmu_page_sizes; r++) { 588 /* add in space for hpm_color_current */ 589 ctrs_sz += sizeof (size_t) * 590 colors_per_szc[r] * nranges; 591 592 if (firstmn != mnode) 593 continue; 594 595 /* add in space for hpm_counters */ 596 r_align = page_get_pagecnt(r); 597 r_base = physbase; 598 r_base &= ~(r_align - 1); 599 r_pgcnt = howmany(physmax - r_base + 1, r_align); 600 601 /* 602 * Round up to always allocate on pointer sized 603 * boundaries. 
604 */ 605 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 606 sizeof (hpmctr_t *)); 607 } 608 } 609 610 for (r = 1; r < mmu_page_sizes; r++) { 611 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 612 } 613 614 /* add in space for page_ctrs_cands and pcc_color_free */ 615 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes * 616 mmu_page_sizes * NPC_MUTEX; 617 618 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 619 620 if (mem_node_config[mnode].exists == 0) 621 continue; 622 623 nranges = mnode_nranges[mnode]; 624 ctrs_sz += sizeof (pcc_info_t) * nranges * 625 mmu_page_sizes * NPC_MUTEX; 626 for (r = 1; r < mmu_page_sizes; r++) { 627 ctrs_sz += sizeof (pgcnt_t) * nranges * 628 colors_per_szc[r] * NPC_MUTEX; 629 } 630 } 631 632 /* ctr_mutex */ 633 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 634 635 /* size for page list counts */ 636 PLCNT_SZ(ctrs_sz); 637 638 /* 639 * add some slop for roundups. page_ctrs_alloc will roundup the start 640 * address of the counters to ecache_alignsize boundary for every 641 * memory node. 642 */ 643 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 644 } 645 646 caddr_t 647 page_ctrs_alloc(caddr_t alloc_base) 648 { 649 int mnode; 650 int mrange, nranges; 651 int r; /* region size */ 652 int i; 653 int firstmn; /* first mnode that exists */ 654 pfn_t physbase; 655 pfn_t physmax; 656 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 657 658 /* 659 * We need to determine how many page colors there are for each 660 * page size in order to allocate memory for any color specific 661 * arrays. 662 */ 663 for (i = 0; i < mmu_page_sizes; i++) { 664 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 665 } 666 667 for (r = 1; r < mmu_page_sizes; r++) { 668 page_counters[r] = (hw_page_map_t *)alloc_base; 669 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 670 } 671 672 /* page_ctrs_cands and pcc_color_free array */ 673 for (i = 0; i < NPC_MUTEX; i++) { 674 for (r = 1; r < mmu_page_sizes; r++) { 675 676 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base; 677 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes; 678 679 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 680 pcc_info_t *pi; 681 682 if (mem_node_config[mnode].exists == 0) 683 continue; 684 685 nranges = mnode_nranges[mnode]; 686 687 pi = (pcc_info_t *)alloc_base; 688 alloc_base += sizeof (pcc_info_t) * nranges; 689 page_ctrs_cands[i][r][mnode] = pi; 690 691 for (mrange = 0; mrange < nranges; mrange++) { 692 pi->pcc_color_free = 693 (pgcnt_t *)alloc_base; 694 alloc_base += sizeof (pgcnt_t) * 695 colors_per_szc[r]; 696 pi++; 697 } 698 } 699 } 700 } 701 702 /* ctr_mutex */ 703 for (i = 0; i < NPC_MUTEX; i++) { 704 ctr_mutex[i] = (kmutex_t *)alloc_base; 705 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 706 } 707 708 /* initialize page list counts */ 709 PLCNT_INIT(alloc_base); 710 711 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 712 713 pgcnt_t r_pgcnt; 714 pfn_t r_base; 715 pgcnt_t r_align; 716 int r_shift; 717 int nranges = mnode_nranges[mnode]; 718 719 if (mem_node_config[mnode].exists == 0) 720 continue; 721 722 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 723 724 for (r = 1; r < mmu_page_sizes; r++) { 725 /* 726 * the page_counters base has to be aligned to the 727 * page count of page size code r otherwise the counts 728 * will cross large page boundaries. 
729 */ 730 r_align = page_get_pagecnt(r); 731 r_base = physbase; 732 /* base needs to be aligned - lower to aligned value */ 733 r_base &= ~(r_align - 1); 734 r_pgcnt = howmany(physmax - r_base + 1, r_align); 735 r_shift = PAGE_BSZS_SHIFT(r); 736 737 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 738 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 739 PAGE_COUNTERS_BASE(mnode, r) = r_base; 740 for (mrange = 0; mrange < nranges; mrange++) { 741 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 742 r, mrange) = (size_t *)alloc_base; 743 alloc_base += sizeof (size_t) * 744 colors_per_szc[r]; 745 } 746 for (i = 0; i < colors_per_szc[r]; i++) { 747 uint_t color_mask = colors_per_szc[r] - 1; 748 pfn_t pfnum = r_base; 749 size_t idx; 750 int mrange; 751 MEM_NODE_ITERATOR_DECL(it); 752 753 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it); 754 if (pfnum == (pfn_t)-1) { 755 idx = 0; 756 } else { 757 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 758 color_mask, color_mask, &it); 759 idx = PNUM_TO_IDX(mnode, r, pfnum); 760 idx = (idx >= r_pgcnt) ? 0 : idx; 761 } 762 for (mrange = 0; mrange < nranges; mrange++) { 763 PAGE_COUNTERS_CURRENT_COLOR(mnode, 764 r, i, mrange) = idx; 765 } 766 } 767 768 /* hpm_counters may be shared by all mnodes */ 769 if (firstmn == mnode) { 770 PAGE_COUNTERS_COUNTERS(mnode, r) = 771 (hpmctr_t *)alloc_base; 772 alloc_base += 773 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 774 sizeof (hpmctr_t *)); 775 } else { 776 PAGE_COUNTERS_COUNTERS(mnode, r) = 777 PAGE_COUNTERS_COUNTERS(firstmn, r); 778 } 779 780 /* 781 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 782 * satisfy the identity requirement. 783 * We should be able to go from one to the other 784 * and get consistent values. 785 */ 786 ASSERT(PNUM_TO_IDX(mnode, r, 787 (IDX_TO_PNUM(mnode, r, 0))) == 0); 788 ASSERT(IDX_TO_PNUM(mnode, r, 789 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 790 } 791 /* 792 * Roundup the start address of the page_counters to 793 * cache aligned boundary for every memory node. 794 * page_ctrs_sz() has added some slop for these roundups. 795 */ 796 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 797 L2CACHE_ALIGN); 798 } 799 800 /* Initialize other page counter specific data structures. */ 801 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 802 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 803 } 804 805 return (alloc_base); 806 } 807 808 /* 809 * Functions to adjust region counters for each size free list. 810 * Caller is responsible to acquire the ctr_mutex lock if necessary and 811 * thus can be called during startup without locks. 812 */ 813 /* ARGSUSED */ 814 void 815 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags) 816 { 817 ssize_t r; /* region size */ 818 ssize_t idx; 819 pfn_t pfnum; 820 int lckidx; 821 822 ASSERT(mnode == PP_2_MEM_NODE(pp)); 823 ASSERT(mtype == PP_2_MTYPE(pp)); 824 825 ASSERT(pp->p_szc < mmu_page_sizes); 826 827 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 828 829 /* no counter update needed for largest page size */ 830 if (pp->p_szc >= mmu_page_sizes - 1) { 831 return; 832 } 833 834 r = pp->p_szc + 1; 835 pfnum = pp->p_pagenum; 836 lckidx = PP_CTR_LOCK_INDX(pp); 837 838 /* 839 * Increment the count of free pages for the current 840 * region. Continue looping up in region size incrementing 841 * count if the preceeding region is full. 
842 */ 843 while (r < mmu_page_sizes) { 844 idx = PNUM_TO_IDX(mnode, r, pfnum); 845 846 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 847 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); 848 849 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) { 850 break; 851 } else { 852 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 853 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 854 [MTYPE_2_MRANGE(mnode, root_mtype)]; 855 856 cand->pcc_pages_free++; 857 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++; 858 } 859 r++; 860 } 861 } 862 863 void 864 page_ctr_add(int mnode, int mtype, page_t *pp, int flags) 865 { 866 int lckidx = PP_CTR_LOCK_INDX(pp); 867 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 868 869 mutex_enter(lock); 870 page_ctr_add_internal(mnode, mtype, pp, flags); 871 mutex_exit(lock); 872 } 873 874 void 875 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags) 876 { 877 int lckidx; 878 ssize_t r; /* region size */ 879 ssize_t idx; 880 pfn_t pfnum; 881 882 ASSERT(mnode == PP_2_MEM_NODE(pp)); 883 ASSERT(mtype == PP_2_MTYPE(pp)); 884 885 ASSERT(pp->p_szc < mmu_page_sizes); 886 887 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags); 888 889 /* no counter update needed for largest page size */ 890 if (pp->p_szc >= mmu_page_sizes - 1) { 891 return; 892 } 893 894 r = pp->p_szc + 1; 895 pfnum = pp->p_pagenum; 896 lckidx = PP_CTR_LOCK_INDX(pp); 897 898 /* 899 * Decrement the count of free pages for the current 900 * region. Continue looping up in region size decrementing 901 * count if the preceeding region was full. 902 */ 903 while (r < mmu_page_sizes) { 904 idx = PNUM_TO_IDX(mnode, r, pfnum); 905 906 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 907 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); 908 909 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { 910 break; 911 } else { 912 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 913 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 914 [MTYPE_2_MRANGE(mnode, root_mtype)]; 915 916 ASSERT(cand->pcc_pages_free != 0); 917 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); 918 919 cand->pcc_pages_free--; 920 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--; 921 } 922 r++; 923 } 924 } 925 926 void 927 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags) 928 { 929 int lckidx = PP_CTR_LOCK_INDX(pp); 930 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 931 932 mutex_enter(lock); 933 page_ctr_sub_internal(mnode, mtype, pp, flags); 934 mutex_exit(lock); 935 } 936 937 /* 938 * Adjust page counters following a memory attach, since typically the 939 * size of the array needs to change, and the PFN to counter index 940 * mapping needs to change. 941 * 942 * It is possible this mnode did not exist at startup. In that case 943 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges 944 * to change (a theoretical possibility on x86), which means pcc_color_free 945 * arrays must be extended. 
946 */ 947 uint_t 948 page_ctrs_adjust(int mnode) 949 { 950 pgcnt_t npgs; 951 int r; /* region size */ 952 int i; 953 size_t pcsz, old_csz; 954 hpmctr_t *new_ctr, *old_ctr; 955 pfn_t oldbase, newbase; 956 pfn_t physbase, physmax; 957 size_t old_npgs; 958 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 959 size_t size_cache[MMU_PAGE_SIZES]; 960 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 961 size_t *old_color_array[MAX_MNODE_MRANGES]; 962 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 963 pcc_info_t **cands_cache; 964 pcc_info_t *old_pi, *pi; 965 pgcnt_t *pgcntp; 966 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode); 967 int cands_cache_nranges; 968 int old_maxmrange, new_maxmrange; 969 int rc = 0; 970 int oldmnode; 971 972 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX * 973 MMU_PAGE_SIZES, KM_NOSLEEP); 974 if (cands_cache == NULL) 975 return (ENOMEM); 976 977 i = -1; 978 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i); 979 980 newbase = physbase & ~PC_BASE_ALIGN_MASK; 981 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase; 982 983 /* prepare to free non-null pointers on the way out */ 984 cands_cache_nranges = nranges; 985 bzero(ctr_cache, sizeof (ctr_cache)); 986 bzero(color_cache, sizeof (color_cache)); 987 988 /* 989 * We need to determine how many page colors there are for each 990 * page size in order to allocate memory for any color specific 991 * arrays. 992 */ 993 for (r = 0; r < mmu_page_sizes; r++) { 994 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r); 995 } 996 997 /* 998 * Preallocate all of the new hpm_counters arrays as we can't 999 * hold the page_ctrs_rwlock as a writer and allocate memory. 1000 * If we can't allocate all of the arrays, undo our work so far 1001 * and return failure. 1002 */ 1003 for (r = 1; r < mmu_page_sizes; r++) { 1004 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 1005 size_cache[r] = pcsz; 1006 ctr_cache[r] = kmem_zalloc(pcsz * 1007 sizeof (hpmctr_t), KM_NOSLEEP); 1008 if (ctr_cache[r] == NULL) { 1009 rc = ENOMEM; 1010 goto cleanup; 1011 } 1012 } 1013 1014 /* 1015 * Preallocate all of the new color current arrays as we can't 1016 * hold the page_ctrs_rwlock as a writer and allocate memory. 1017 * If we can't allocate all of the arrays, undo our work so far 1018 * and return failure. 1019 */ 1020 for (r = 1; r < mmu_page_sizes; r++) { 1021 for (mrange = 0; mrange < nranges; mrange++) { 1022 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) * 1023 colors_per_szc[r], KM_NOSLEEP); 1024 if (color_cache[r][mrange] == NULL) { 1025 rc = ENOMEM; 1026 goto cleanup; 1027 } 1028 } 1029 } 1030 1031 /* 1032 * Preallocate all of the new pcc_info_t arrays as we can't 1033 * hold the page_ctrs_rwlock as a writer and allocate memory. 1034 * If we can't allocate all of the arrays, undo our work so far 1035 * and return failure. 1036 */ 1037 for (r = 1; r < mmu_page_sizes; r++) { 1038 for (i = 0; i < NPC_MUTEX; i++) { 1039 pi = kmem_zalloc(nranges * sizeof (pcc_info_t), 1040 KM_NOSLEEP); 1041 if (pi == NULL) { 1042 rc = ENOMEM; 1043 goto cleanup; 1044 } 1045 cands_cache[i * MMU_PAGE_SIZES + r] = pi; 1046 1047 for (mrange = 0; mrange < nranges; mrange++, pi++) { 1048 pgcntp = kmem_zalloc(colors_per_szc[r] * 1049 sizeof (pgcnt_t), KM_NOSLEEP); 1050 if (pgcntp == NULL) { 1051 rc = ENOMEM; 1052 goto cleanup; 1053 } 1054 pi->pcc_color_free = pgcntp; 1055 } 1056 } 1057 } 1058 1059 /* 1060 * Grab the write lock to prevent others from walking these arrays 1061 * while we are modifying them. 
1062 */ 1063 PAGE_CTRS_WRITE_LOCK(mnode); 1064 1065 /* 1066 * For interleaved mnodes, find the first mnode 1067 * with valid page counters since the current 1068 * mnode may have just been added and not have 1069 * valid page counters. 1070 */ 1071 if (interleaved_mnodes) { 1072 for (i = 0; i < max_mem_nodes; i++) 1073 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL) 1074 break; 1075 ASSERT(i < max_mem_nodes); 1076 oldmnode = i; 1077 } else 1078 oldmnode = mnode; 1079 1080 old_nranges = mnode_nranges[mnode]; 1081 cands_cache_nranges = old_nranges; 1082 mnode_nranges[mnode] = nranges; 1083 old_maxmrange = mnode_maxmrange[mnode]; 1084 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 1085 new_maxmrange = mnode_maxmrange[mnode]; 1086 1087 for (r = 1; r < mmu_page_sizes; r++) { 1088 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 1089 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r); 1090 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r); 1091 oldbase = PAGE_COUNTERS_BASE(oldmnode, r); 1092 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r); 1093 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1094 old_color_array[mrange] = 1095 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 1096 r, mrange); 1097 } 1098 1099 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 1100 new_ctr = ctr_cache[r]; 1101 ctr_cache[r] = NULL; 1102 if (old_ctr != NULL && 1103 (oldbase + old_npgs > newbase) && 1104 (newbase + npgs > oldbase)) { 1105 /* 1106 * Map the intersection of the old and new 1107 * counters into the new array. 1108 */ 1109 size_t offset; 1110 if (newbase > oldbase) { 1111 offset = (newbase - oldbase) >> 1112 PAGE_COUNTERS_SHIFT(mnode, r); 1113 bcopy(old_ctr + offset, new_ctr, 1114 MIN(pcsz, (old_csz - offset)) * 1115 sizeof (hpmctr_t)); 1116 } else { 1117 offset = (oldbase - newbase) >> 1118 PAGE_COUNTERS_SHIFT(mnode, r); 1119 bcopy(old_ctr, new_ctr + offset, 1120 MIN(pcsz - offset, old_csz) * 1121 sizeof (hpmctr_t)); 1122 } 1123 } 1124 1125 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 1126 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 1127 PAGE_COUNTERS_BASE(mnode, r) = newbase; 1128 1129 /* update shared hpm_counters in other mnodes */ 1130 if (interleaved_mnodes) { 1131 for (i = 0; i < max_mem_nodes; i++) { 1132 if ((i == mnode) || 1133 (mem_node_config[i].exists == 0)) 1134 continue; 1135 ASSERT( 1136 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr || 1137 PAGE_COUNTERS_COUNTERS(i, r) == NULL); 1138 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr; 1139 PAGE_COUNTERS_ENTRIES(i, r) = pcsz; 1140 PAGE_COUNTERS_BASE(i, r) = newbase; 1141 } 1142 } 1143 1144 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1145 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) = 1146 color_cache[r][mrange]; 1147 color_cache[r][mrange] = NULL; 1148 } 1149 /* 1150 * for now, just reset on these events as it's probably 1151 * not worthwhile to try and optimize this. 1152 */ 1153 for (i = 0; i < colors_per_szc[r]; i++) { 1154 uint_t color_mask = colors_per_szc[r] - 1; 1155 int mlo = interleaved_mnodes ? 0 : mnode; 1156 int mhi = interleaved_mnodes ? max_mem_nodes : 1157 (mnode + 1); 1158 int m; 1159 pfn_t pfnum; 1160 size_t idx; 1161 MEM_NODE_ITERATOR_DECL(it); 1162 1163 for (m = mlo; m < mhi; m++) { 1164 if (mem_node_config[m].exists == 0) 1165 continue; 1166 pfnum = newbase; 1167 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it); 1168 if (pfnum == (pfn_t)-1) { 1169 idx = 0; 1170 } else { 1171 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 1172 color_mask, color_mask, &it); 1173 idx = PNUM_TO_IDX(m, r, pfnum); 1174 idx = (idx < pcsz) ? 
idx : 0; 1175 } 1176 for (mrange = 0; mrange < nranges; mrange++) { 1177 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m, 1178 r, mrange) != NULL) 1179 PAGE_COUNTERS_CURRENT_COLOR(m, 1180 r, i, mrange) = idx; 1181 } 1182 } 1183 } 1184 1185 /* cache info for freeing out of the critical path */ 1186 if ((caddr_t)old_ctr >= kernelheap && 1187 (caddr_t)old_ctr < ekernelheap) { 1188 ctr_cache[r] = old_ctr; 1189 size_cache[r] = old_csz; 1190 } 1191 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1192 size_t *tmp = old_color_array[mrange]; 1193 if ((caddr_t)tmp >= kernelheap && 1194 (caddr_t)tmp < ekernelheap) { 1195 color_cache[r][mrange] = tmp; 1196 } 1197 } 1198 /* 1199 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 1200 * satisfy the identity requirement. 1201 * We should be able to go from one to the other 1202 * and get consistent values. 1203 */ 1204 ASSERT(PNUM_TO_IDX(mnode, r, 1205 (IDX_TO_PNUM(mnode, r, 0))) == 0); 1206 ASSERT(IDX_TO_PNUM(mnode, r, 1207 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 1208 1209 /* pcc_info_t and pcc_color_free */ 1210 for (i = 0; i < NPC_MUTEX; i++) { 1211 pcc_info_t *epi; 1212 pcc_info_t *eold_pi; 1213 1214 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1215 old_pi = page_ctrs_cands[i][r][mnode]; 1216 page_ctrs_cands[i][r][mnode] = pi; 1217 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi; 1218 1219 /* preserve old pcc_color_free values, if any */ 1220 if (old_pi == NULL) 1221 continue; 1222 1223 /* 1224 * when/if x86 does DR, must account for 1225 * possible change in range index when 1226 * preserving pcc_info 1227 */ 1228 epi = &pi[nranges]; 1229 eold_pi = &old_pi[old_nranges]; 1230 if (new_maxmrange > old_maxmrange) { 1231 pi += new_maxmrange - old_maxmrange; 1232 } else if (new_maxmrange < old_maxmrange) { 1233 old_pi += old_maxmrange - new_maxmrange; 1234 } 1235 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) { 1236 pcc_info_t tmp = *pi; 1237 *pi = *old_pi; 1238 *old_pi = tmp; 1239 } 1240 } 1241 } 1242 PAGE_CTRS_WRITE_UNLOCK(mnode); 1243 1244 /* 1245 * Now that we have dropped the write lock, it is safe to free all 1246 * of the memory we have cached above. 1247 * We come thru here to free memory when pre-alloc fails, and also to 1248 * free old pointers which were recorded while locked. 1249 */ 1250 cleanup: 1251 for (r = 1; r < mmu_page_sizes; r++) { 1252 if (ctr_cache[r] != NULL) { 1253 kmem_free(ctr_cache[r], 1254 size_cache[r] * sizeof (hpmctr_t)); 1255 } 1256 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1257 if (color_cache[r][mrange] != NULL) { 1258 kmem_free(color_cache[r][mrange], 1259 colors_per_szc[r] * sizeof (size_t)); 1260 } 1261 } 1262 for (i = 0; i < NPC_MUTEX; i++) { 1263 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1264 if (pi == NULL) 1265 continue; 1266 nr = cands_cache_nranges; 1267 for (mrange = 0; mrange < nr; mrange++, pi++) { 1268 pgcntp = pi->pcc_color_free; 1269 if (pgcntp == NULL) 1270 continue; 1271 if ((caddr_t)pgcntp >= kernelheap && 1272 (caddr_t)pgcntp < ekernelheap) { 1273 kmem_free(pgcntp, 1274 colors_per_szc[r] * 1275 sizeof (pgcnt_t)); 1276 } 1277 } 1278 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1279 if ((caddr_t)pi >= kernelheap && 1280 (caddr_t)pi < ekernelheap) { 1281 kmem_free(pi, nr * sizeof (pcc_info_t)); 1282 } 1283 } 1284 } 1285 1286 kmem_free(cands_cache, 1287 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES); 1288 return (rc); 1289 } 1290 1291 /* 1292 * Cleanup the hpm_counters field in the page counters 1293 * array. 
1294 */ 1295 void 1296 page_ctrs_cleanup(void) 1297 { 1298 int r; /* region size */ 1299 int i; /* mnode index */ 1300 1301 /* 1302 * Get the page counters write lock while we are 1303 * setting the page hpm_counters field to NULL 1304 * for non-existent mnodes. 1305 */ 1306 for (i = 0; i < max_mem_nodes; i++) { 1307 PAGE_CTRS_WRITE_LOCK(i); 1308 if (mem_node_config[i].exists) { 1309 PAGE_CTRS_WRITE_UNLOCK(i); 1310 continue; 1311 } 1312 for (r = 1; r < mmu_page_sizes; r++) { 1313 PAGE_COUNTERS_COUNTERS(i, r) = NULL; 1314 } 1315 PAGE_CTRS_WRITE_UNLOCK(i); 1316 } 1317 } 1318 1319 #ifdef DEBUG 1320 1321 /* 1322 * confirm pp is a large page corresponding to szc 1323 */ 1324 void 1325 chk_lpg(page_t *pp, uchar_t szc) 1326 { 1327 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 1328 uint_t noreloc; 1329 1330 if (npgs == 1) { 1331 ASSERT(pp->p_szc == 0); 1332 ASSERT(pp->p_next == pp); 1333 ASSERT(pp->p_prev == pp); 1334 return; 1335 } 1336 1337 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1338 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1339 1340 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 1341 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 1342 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 1343 ASSERT(pp->p_prev == (pp + (npgs - 1))); 1344 1345 /* 1346 * Check list of pages. 1347 */ 1348 noreloc = PP_ISNORELOC(pp); 1349 while (npgs--) { 1350 if (npgs != 0) { 1351 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); 1352 ASSERT(pp->p_next == (pp + 1)); 1353 } 1354 ASSERT(pp->p_szc == szc); 1355 ASSERT(PP_ISFREE(pp)); 1356 ASSERT(PP_ISAGED(pp)); 1357 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1358 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1359 ASSERT(pp->p_vnode == NULL); 1360 ASSERT(PP_ISNORELOC(pp) == noreloc); 1361 1362 pp = pp->p_next; 1363 } 1364 } 1365 #endif /* DEBUG */ 1366 1367 void 1368 page_freelist_lock(int mnode) 1369 { 1370 int i; 1371 for (i = 0; i < NPC_MUTEX; i++) { 1372 mutex_enter(FPC_MUTEX(mnode, i)); 1373 mutex_enter(CPC_MUTEX(mnode, i)); 1374 } 1375 } 1376 1377 void 1378 page_freelist_unlock(int mnode) 1379 { 1380 int i; 1381 for (i = 0; i < NPC_MUTEX; i++) { 1382 mutex_exit(FPC_MUTEX(mnode, i)); 1383 mutex_exit(CPC_MUTEX(mnode, i)); 1384 } 1385 } 1386 1387 /* 1388 * add pp to the specified page list. Defaults to head of the page list 1389 * unless PG_LIST_TAIL is specified. 1390 */ 1391 void 1392 page_list_add(page_t *pp, int flags) 1393 { 1394 page_t **ppp; 1395 kmutex_t *pcm; 1396 uint_t bin, mtype; 1397 int mnode; 1398 1399 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1400 ASSERT(PP_ISFREE(pp)); 1401 ASSERT(!hat_page_is_mapped(pp)); 1402 ASSERT(hat_page_getshare(pp) == 0); 1403 1404 /* 1405 * Large pages should be freed via page_list_add_pages(). 1406 */ 1407 ASSERT(pp->p_szc == 0); 1408 1409 /* 1410 * Don't need to lock the freelist first here 1411 * because the page isn't on the freelist yet. 1412 * This means p_szc can't change on us. 1413 */ 1414 1415 bin = PP_2_BIN(pp); 1416 mnode = PP_2_MEM_NODE(pp); 1417 mtype = PP_2_MTYPE(pp); 1418 1419 if (flags & PG_LIST_ISINIT) { 1420 /* 1421 * PG_LIST_ISINIT is set during system startup (ie. 
single 1422 * threaded), add a page to the free list and add to the 1423 * the free region counters w/o any locking 1424 */ 1425 ASSERT(!PP_ISKFLT(pp)); 1426 ppp = PAGE_FREELISTP(PFLT_USER, mnode, 0, bin, mtype); 1427 1428 /* inline version of page_add() */ 1429 if (*ppp != NULL) { 1430 pp->p_next = *ppp; 1431 pp->p_prev = (*ppp)->p_prev; 1432 (*ppp)->p_prev = pp; 1433 pp->p_prev->p_next = pp; 1434 } else 1435 *ppp = pp; 1436 1437 page_ctr_add_internal(mnode, mtype, pp, flags); 1438 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1439 } else { 1440 pcm = PC_BIN_MUTEX(PP_ISKFLT(pp), mnode, bin, flags); 1441 1442 if (flags & PG_FREE_LIST) { 1443 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1444 ASSERT(PP_ISAGED(pp)); 1445 ppp = PAGE_FREELISTP(PP_ISKFLT(pp), mnode, 0, 1446 bin, mtype); 1447 } else { 1448 VM_STAT_ADD(vmm_vmstats.pladd_cache); 1449 ASSERT(pp->p_vnode); 1450 ASSERT((pp->p_offset & PAGEOFFSET) == 0); 1451 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1452 } 1453 mutex_enter(pcm); 1454 page_add(ppp, pp); 1455 1456 if (flags & PG_LIST_TAIL) 1457 *ppp = (*ppp)->p_next; 1458 /* 1459 * Add counters before releasing pcm mutex to avoid a race with 1460 * page_freelist_coalesce and page_freelist_split. 1461 */ 1462 page_ctr_add(mnode, mtype, pp, flags); 1463 mutex_exit(pcm); 1464 } 1465 1466 1467 #if defined(__sparc) 1468 if (PP_ISNORELOC(pp)) { 1469 kcage_freemem_add(1); 1470 } 1471 #elif defined(__amd64) && !defined(__xpv) 1472 if (PP_ISKFLT(pp)) { 1473 kflt_freemem_add(1); 1474 if (PP_ISUSERKFLT(pp)) { 1475 ASSERT(kflt_user_alloc > 0); 1476 atomic_add_long(&kflt_user_alloc, -1); 1477 PP_CLRUSERKFLT(pp); 1478 } 1479 } 1480 #endif /* __sparc */ 1481 /* 1482 * It is up to the caller to unlock the page! 1483 */ 1484 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1485 } 1486 1487 1488 #ifdef __sparc 1489 /* 1490 * This routine is only used by kcage_init during system startup. 1491 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1492 * without the overhead of taking locks and updating counters. 1493 */ 1494 void 1495 page_list_noreloc_startup(page_t *pp) 1496 { 1497 page_t **ppp; 1498 uint_t bin; 1499 int mnode; 1500 int mtype; 1501 int flags = 0; 1502 1503 /* 1504 * If this is a large page on the freelist then 1505 * break it up into smaller pages. 1506 */ 1507 if (pp->p_szc != 0) 1508 page_boot_demote(pp); 1509 1510 /* 1511 * Get list page is currently on. 1512 */ 1513 bin = PP_2_BIN(pp); 1514 mnode = PP_2_MEM_NODE(pp); 1515 mtype = PP_2_MTYPE(pp); 1516 ASSERT(mtype == MTYPE_RELOC); 1517 ASSERT(pp->p_szc == 0); 1518 1519 if (PP_ISAGED(pp)) { 1520 ASSERT(!PP_ISKFLT(pp)); 1521 ppp = PAGE_FREELISTP(PFLT_USER, mnode, 0, bin, mtype); 1522 flags |= PG_FREE_LIST; 1523 } else { 1524 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1525 flags |= PG_CACHE_LIST; 1526 } 1527 1528 ASSERT(*ppp != NULL); 1529 1530 /* 1531 * Delete page from current list. 1532 */ 1533 if (*ppp == pp) 1534 *ppp = pp->p_next; /* go to next page */ 1535 if (*ppp == pp) { 1536 *ppp = NULL; /* page list is gone */ 1537 } else { 1538 pp->p_prev->p_next = pp->p_next; 1539 pp->p_next->p_prev = pp->p_prev; 1540 } 1541 1542 /* 1543 * Decrement page counters 1544 */ 1545 page_ctr_sub_internal(mnode, mtype, pp, flags); 1546 1547 /* 1548 * Set no reloc for cage initted pages. 1549 */ 1550 PP_SETNORELOC(pp); 1551 1552 mtype = PP_2_MTYPE(pp); 1553 ASSERT(mtype == MTYPE_NORELOC); 1554 1555 /* 1556 * Get new list for page. 
1557 */ 1558 if (PP_ISAGED(pp)) { 1559 ASSERT(!PP_ISKFLT(pp)); 1560 ppp = PAGE_FREELISTP(PFLT_USER, mnode, 0, bin, mtype); 1561 } else { 1562 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1563 } 1564 1565 /* 1566 * Insert page on new list. 1567 */ 1568 if (*ppp == NULL) { 1569 *ppp = pp; 1570 pp->p_next = pp->p_prev = pp; 1571 } else { 1572 pp->p_next = *ppp; 1573 pp->p_prev = (*ppp)->p_prev; 1574 (*ppp)->p_prev = pp; 1575 pp->p_prev->p_next = pp; 1576 } 1577 1578 /* 1579 * Increment page counters 1580 */ 1581 page_ctr_add_internal(mnode, mtype, pp, flags); 1582 1583 /* 1584 * Update cage freemem counter 1585 */ 1586 atomic_add_long(&kcage_freemem, 1); 1587 } 1588 #else /* __sparc */ 1589 1590 /* ARGSUSED */ 1591 void 1592 page_list_noreloc_startup(page_t *pp) 1593 { 1594 panic("page_list_noreloc_startup: should be here only for sparc"); 1595 } 1596 #endif 1597 1598 void 1599 page_list_add_pages(page_t *pp, int flags) 1600 { 1601 kmutex_t *pcm; 1602 pgcnt_t pgcnt; 1603 uint_t bin, mtype, i; 1604 int mnode; 1605 1606 /* default to freelist/head */ 1607 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); 1608 1609 CHK_LPG(pp, pp->p_szc); 1610 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]); 1611 1612 bin = PP_2_BIN(pp); 1613 mnode = PP_2_MEM_NODE(pp); 1614 mtype = PP_2_MTYPE(pp); 1615 1616 if (flags & PG_LIST_ISINIT) { 1617 ASSERT(pp->p_szc == mmu_page_sizes - 1); 1618 page_vpadd(PAGE_FREELISTP(PFLT_USER, mnode, pp->p_szc, 1619 bin, mtype), pp); 1620 ASSERT(!PP_ISNORELOC(pp)); 1621 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 1622 } else { 1623 1624 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 1625 1626 pcm = PC_BIN_MUTEX(PFLT_USER, mnode, bin, PG_FREE_LIST); 1627 1628 mutex_enter(pcm); 1629 ASSERT(!PP_ISKFLT(pp)); 1630 page_vpadd(PAGE_FREELISTP(PFLT_USER, mnode, pp->p_szc, 1631 bin, mtype), pp); 1632 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1633 mutex_exit(pcm); 1634 1635 pgcnt = page_get_pagecnt(pp->p_szc); 1636 #if defined(__sparc) 1637 if (PP_ISNORELOC(pp)) { 1638 kcage_freemem_add(pgcnt); 1639 } 1640 #elif defined(__amd64) && !defined(__xpv) 1641 ASSERT(!PP_ISKFLT(pp)); 1642 #endif /* __sparc */ 1643 for (i = 0; i < pgcnt; i++, pp++) 1644 page_unlock_nocapture(pp); 1645 } 1646 } 1647 1648 /* 1649 * During boot, need to demote a large page to base 1650 * pagesize pages for seg_kmem for use in boot_alloc() 1651 */ 1652 void 1653 page_boot_demote(page_t *pp) 1654 { 1655 ASSERT(pp->p_szc != 0); 1656 ASSERT(PP_ISFREE(pp)); 1657 ASSERT(PP_ISAGED(pp)); 1658 1659 (void) page_demote(PP_2_MEM_NODE(pp), 1660 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, 1661 PC_FREE); 1662 1663 ASSERT(PP_ISFREE(pp)); 1664 ASSERT(PP_ISAGED(pp)); 1665 ASSERT(pp->p_szc == 0); 1666 } 1667 1668 /* 1669 * Take a particular page off of whatever freelist the page 1670 * is claimed to be on. 1671 * 1672 * NOTE: Only used for PAGESIZE pages. 1673 */ 1674 void 1675 page_list_sub(page_t *pp, int flags) 1676 { 1677 int bin; 1678 uint_t mtype; 1679 int mnode; 1680 kmutex_t *pcm; 1681 page_t **ppp; 1682 1683 ASSERT(PAGE_EXCL(pp)); 1684 ASSERT(PP_ISFREE(pp)); 1685 1686 /* 1687 * The p_szc field can only be changed by page_promote() 1688 * and page_demote(). Only free pages can be promoted and 1689 * demoted and the free list MUST be locked during these 1690 * operations. So to prevent a race in page_list_sub() 1691 * between computing which bin of the freelist lock to 1692 * grab and actually grabing the lock we check again that 1693 * the bin we locked is still the correct one. 
Notice that 1694 * the p_szc field could have actually changed on us but 1695 * if the bin happens to still be the same we are safe. 1696 */ 1697 try_again: 1698 bin = PP_2_BIN(pp); 1699 mnode = PP_2_MEM_NODE(pp); 1700 pcm = PC_BIN_MUTEX(PP_ISKFLT(pp), mnode, bin, flags); 1701 mutex_enter(pcm); 1702 if (PP_2_BIN(pp) != bin) { 1703 mutex_exit(pcm); 1704 goto try_again; 1705 } 1706 mtype = PP_2_MTYPE(pp); 1707 1708 if (flags & PG_FREE_LIST) { 1709 VM_STAT_ADD(vmm_vmstats.plsub_free[0]); 1710 ASSERT(PP_ISAGED(pp)); 1711 ppp = PAGE_FREELISTP(PP_ISKFLT(pp), mnode, pp->p_szc, 1712 bin, mtype); 1713 } else { 1714 VM_STAT_ADD(vmm_vmstats.plsub_cache); 1715 ASSERT(!PP_ISAGED(pp)); 1716 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1717 } 1718 1719 /* 1720 * Common PAGESIZE case. 1721 * 1722 * Note that we locked the freelist. This prevents 1723 * any page promotion/demotion operations. Therefore 1724 * the p_szc will not change until we drop pcm mutex. 1725 */ 1726 if (pp->p_szc == 0) { 1727 page_sub(ppp, pp); 1728 /* 1729 * Subtract counters before releasing pcm mutex 1730 * to avoid race with page_freelist_coalesce. 1731 */ 1732 page_ctr_sub(mnode, mtype, pp, flags); 1733 mutex_exit(pcm); 1734 1735 #if defined(__sparc) 1736 if (PP_ISNORELOC(pp)) { 1737 kcage_freemem_sub(1); 1738 } 1739 #elif defined(__amd64) && !defined(__xpv) 1740 if (PP_ISKFLT(pp)) { 1741 kflt_freemem_sub(1); 1742 } 1743 #endif /* __sparc */ 1744 return; 1745 } 1746 1747 /* 1748 * Large pages on the cache list are not supported. 1749 */ 1750 if (flags & PG_CACHE_LIST) 1751 panic("page_list_sub: large page on cachelist"); 1752 1753 /* 1754 * Slow but rare. 1755 * 1756 * Somebody wants this particular page which is part 1757 * of a large page. In this case we just demote the page 1758 * if it's on the freelist. 1759 * 1760 * We have to drop pcm before locking the entire freelist. 1761 * Once we have re-locked the freelist check to make sure 1762 * the page hasn't already been demoted or completely 1763 * freed. 1764 */ 1765 mutex_exit(pcm); 1766 page_freelist_lock(mnode); 1767 if (pp->p_szc != 0) { 1768 /* 1769 * Large page is on freelist. 1770 */ 1771 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1772 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1773 } 1774 ASSERT(PP_ISFREE(pp)); 1775 ASSERT(PP_ISAGED(pp)); 1776 ASSERT(pp->p_szc == 0); 1777 1778 /* Large pages on the kernel freelist are not supported. */ 1779 ASSERT(!PP_ISKFLT(pp)); 1780 1781 /* 1782 * Subtract counters before releasing pcm mutex 1783 * to avoid race with page_freelist_coalesce. 1784 */ 1785 bin = PP_2_BIN(pp); 1786 mtype = PP_2_MTYPE(pp); 1787 ppp = PAGE_FREELISTP(PFLT_USER, mnode, pp->p_szc, bin, mtype); 1788 page_sub(ppp, pp); 1789 page_ctr_sub(mnode, mtype, pp, flags); 1790 page_freelist_unlock(mnode); 1791 1792 #if defined(__sparc) 1793 if (PP_ISNORELOC(pp)) { 1794 kcage_freemem_sub(1); 1795 } 1796 #endif /* __sparc */ 1797 } 1798 1799 void 1800 page_list_sub_pages(page_t *pp, uint_t szc) 1801 { 1802 kmutex_t *pcm; 1803 uint_t bin, mtype; 1804 int mnode; 1805 1806 ASSERT(PAGE_EXCL(pp)); 1807 ASSERT(PP_ISFREE(pp)); 1808 ASSERT(PP_ISAGED(pp)); 1809 1810 /* 1811 * See comment in page_list_sub(). 
1812 */ 1813 try_again: 1814 bin = PP_2_BIN(pp); 1815 mnode = PP_2_MEM_NODE(pp); 1816 pcm = PC_BIN_MUTEX(PP_ISKFLT(pp), mnode, bin, PG_FREE_LIST); 1817 mutex_enter(pcm); 1818 if (PP_2_BIN(pp) != bin) { 1819 mutex_exit(pcm); 1820 goto try_again; 1821 } 1822 1823 /* 1824 * If we're called with a page larger than szc or it got 1825 * promoted above szc before we locked the freelist then 1826 * drop pcm and re-lock entire freelist. If page still larger 1827 * than szc then demote it. 1828 */ 1829 if (pp->p_szc > szc) { 1830 mutex_exit(pcm); 1831 pcm = NULL; 1832 page_freelist_lock(mnode); 1833 if (pp->p_szc > szc) { 1834 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig); 1835 (void) page_demote(mnode, 1836 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, 1837 pp->p_szc, szc, PC_NO_COLOR, PC_FREE); 1838 } 1839 bin = PP_2_BIN(pp); 1840 } 1841 ASSERT(PP_ISFREE(pp)); 1842 ASSERT(PP_ISAGED(pp)); 1843 ASSERT(pp->p_szc <= szc); 1844 ASSERT(pp == PP_PAGEROOT(pp)); 1845 ASSERT(!PP_ISKFLT(pp)); 1846 1847 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]); 1848 1849 mtype = PP_2_MTYPE(pp); 1850 if (pp->p_szc != 0) { 1851 page_vpsub(PAGE_FREELISTP(PFLT_USER, mnode, pp->p_szc, 1852 bin, mtype), pp); 1853 CHK_LPG(pp, pp->p_szc); 1854 } else { 1855 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0); 1856 page_sub(PAGE_FREELISTP(PFLT_USER, mnode, pp->p_szc, 1857 bin, mtype), pp); 1858 } 1859 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1860 1861 if (pcm != NULL) { 1862 mutex_exit(pcm); 1863 } else { 1864 page_freelist_unlock(mnode); 1865 } 1866 1867 #if defined(__sparc) 1868 if (PP_ISNORELOC(pp)) { 1869 pgcnt_t pgcnt; 1870 1871 pgcnt = page_get_pagecnt(pp->p_szc); 1872 kcage_freemem_sub(pgcnt); 1873 } 1874 #endif /* __sparc */ 1875 } 1876 1877 /* 1878 * Add the page to the front of a linked list of pages 1879 * using the p_next & p_prev pointers for the list. 1880 * The caller is responsible for protecting the list pointers. 1881 */ 1882 void 1883 mach_page_add(page_t **ppp, page_t *pp) 1884 { 1885 if (*ppp == NULL) { 1886 pp->p_next = pp->p_prev = pp; 1887 } else { 1888 pp->p_next = *ppp; 1889 pp->p_prev = (*ppp)->p_prev; 1890 (*ppp)->p_prev = pp; 1891 pp->p_prev->p_next = pp; 1892 } 1893 *ppp = pp; 1894 } 1895 1896 /* 1897 * Remove this page from a linked list of pages 1898 * using the p_next & p_prev pointers for the list. 1899 * 1900 * The caller is responsible for protecting the list pointers. 1901 */ 1902 void 1903 mach_page_sub(page_t **ppp, page_t *pp) 1904 { 1905 ASSERT(PP_ISFREE(pp)); 1906 1907 if (*ppp == NULL || pp == NULL) 1908 panic("mach_page_sub"); 1909 1910 if (*ppp == pp) 1911 *ppp = pp->p_next; /* go to next page */ 1912 1913 if (*ppp == pp) 1914 *ppp = NULL; /* page list is gone */ 1915 else { 1916 pp->p_prev->p_next = pp->p_next; 1917 pp->p_next->p_prev = pp->p_prev; 1918 } 1919 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 1920 } 1921 1922 /* 1923 * Routine fsflush uses to gradually coalesce the free list into larger pages. 
1924 */ 1925 void 1926 page_promote_size(page_t *pp, uint_t cur_szc) 1927 { 1928 pfn_t pfn; 1929 int mnode; 1930 int idx; 1931 int new_szc = cur_szc + 1; 1932 int full = FULL_REGION_CNT(new_szc); 1933 1934 pfn = page_pptonum(pp); 1935 mnode = PFN_2_MEM_NODE(pfn); 1936 1937 page_freelist_lock(mnode); 1938 1939 idx = PNUM_TO_IDX(mnode, new_szc, pfn); 1940 if (PAGE_COUNTERS(mnode, new_szc, idx) == full) 1941 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY); 1942 1943 page_freelist_unlock(mnode); 1944 } 1945 1946 static uint_t page_promote_err; 1947 static uint_t page_promote_noreloc_err; 1948 static uint_t page_promote_kflt_err; 1949 /* 1950 * Create a single larger page (of szc new_szc) from smaller contiguous pages 1951 * for the given mnode starting at pfnum. Pages involved are on the freelist 1952 * before the call and may be returned to the caller if requested, otherwise 1953 * they will be placed back on the freelist. 1954 * If flags is PC_ALLOC, then the large page will be returned to the user in 1955 * a state which is consistent with a page being taken off the freelist. If 1956 * we failed to lock the new large page, then we will return NULL to the 1957 * caller and put the large page on the freelist instead. 1958 * If flags is PC_FREE, then the large page will be placed on the freelist, 1959 * and NULL will be returned. 1960 * If the PC_KFLT_EXPORT flag is set, the large page will be returned to the 1961 * caller unlocked, as the caller is going to put it on the user page 1962 * freelist 1963 * The caller is responsible for locking the freelist as well as any other 1964 * accounting which needs to be done for a returned page. 1965 * 1966 * RFE: For performance pass in pp instead of pfnum so 1967 * we can avoid excessive calls to page_numtopp_nolock(). 1968 * This would depend on an assumption that all contiguous 1969 * pages are in the same memseg so we can just add/dec 1970 * our pp. 1971 * 1972 * Lock ordering: 1973 * 1974 * There is a potential but rare deadlock situation 1975 * for page promotion and demotion operations. The problem 1976 * is there are two paths into the freelist manager and 1977 * they have different lock orders: 1978 * 1979 * page_create() 1980 * lock freelist 1981 * page_lock(EXCL) 1982 * unlock freelist 1983 * return 1984 * caller drops page_lock 1985 * 1986 * page_free() and page_reclaim() 1987 * caller grabs page_lock(EXCL) 1988 * 1989 * lock freelist 1990 * unlock freelist 1991 * drop page_lock 1992 * 1993 * What prevents a thread in page_create() from deadlocking 1994 * with a thread freeing or reclaiming the same page is the 1995 * page_trylock() in page_get_freelist(). If the trylock fails 1996 * it skips the page. 1997 * 1998 * The lock ordering for promotion and demotion is the same as 1999 * for page_create(). Since the same deadlock could occur during 2000 * page promotion and freeing or reclaiming of a page on the 2001 * cache list we might have to fail the operation and undo what 2002 * have done so far. Again this is rare. 2003 */ 2004 page_t * 2005 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype) 2006 { 2007 page_t *pp, *pplist, *tpp, *start_pp; 2008 pgcnt_t new_npgs, npgs; 2009 uint_t bin; 2010 pgcnt_t tmpnpgs, pages_left; 2011 uint_t noreloc; 2012 int which_list; 2013 ulong_t index; 2014 kmutex_t *phm; 2015 2016 /* 2017 * General algorithm: 2018 * Find the starting page 2019 * Walk each page struct removing it from the freelist, 2020 * and linking it to all the other pages removed. 
2021 * Once all pages are off the freelist, 2022 * walk the list, modifying p_szc to new_szc and what 2023 * ever other info needs to be done to create a large free page. 2024 * According to the flags, either return the page or put it 2025 * on the freelist. 2026 */ 2027 2028 start_pp = page_numtopp_nolock(pfnum); 2029 ASSERT(start_pp && (start_pp->p_pagenum == pfnum)); 2030 new_npgs = page_get_pagecnt(new_szc); 2031 ASSERT(IS_P2ALIGNED(pfnum, new_npgs)); 2032 2033 /* don't return page of the wrong mtype */ 2034 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp)) 2035 return (NULL); 2036 2037 /* 2038 * Loop through smaller pages to confirm that all pages 2039 * give the same result for PP_ISNORELOC(). 2040 * We can check this reliably here as the protocol for setting 2041 * P_NORELOC requires pages to be taken off the free list first. 2042 */ 2043 noreloc = PP_ISNORELOC(start_pp); 2044 for (pp = start_pp + new_npgs; --pp > start_pp; ) { 2045 if (noreloc != PP_ISNORELOC(pp)) { 2046 page_promote_noreloc_err++; 2047 page_promote_err++; 2048 return (NULL); 2049 } 2050 2051 /* 2052 * page_promote() can only legitimately be called for 2053 * pages on the kernel freelist by the kflt_export() 2054 * routine, which sets the PC_KFLT_EXPORT flag. 2055 */ 2056 if (PP_ISKFLT(pp) && !(flags & PC_KFLT_EXPORT)) { 2057 page_promote_kflt_err++; 2058 page_promote_err++; 2059 return (NULL); 2060 } 2061 } 2062 2063 pages_left = new_npgs; 2064 pplist = NULL; 2065 pp = start_pp; 2066 2067 /* Loop around coalescing the smaller pages into a big page. */ 2068 while (pages_left) { 2069 /* 2070 * Remove from the freelist. 2071 */ 2072 ASSERT(PP_ISFREE(pp)); 2073 bin = PP_2_BIN(pp); 2074 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2075 mtype = PP_2_MTYPE(pp); 2076 if (PP_ISAGED(pp)) { 2077 2078 /* 2079 * PG_FREE_LIST 2080 */ 2081 if (pp->p_szc) { 2082 page_vpsub(PAGE_FREELISTP(PFLT_USER, mnode, 2083 pp->p_szc, bin, mtype), pp); 2084 } else { 2085 ASSERT(!PP_ISKFLT(pp) || 2086 (flags & PC_KFLT_EXPORT)); 2087 mach_page_sub(PAGE_FREELISTP(PP_ISKFLT(pp), 2088 mnode, 0, bin, mtype), pp); 2089 } 2090 which_list = PG_FREE_LIST; 2091 } else { 2092 ASSERT(pp->p_szc == 0); 2093 2094 /* 2095 * PG_CACHE_LIST 2096 * 2097 * Since this page comes from the 2098 * cachelist, we must destroy the 2099 * vnode association. 2100 */ 2101 if (!page_trylock(pp, SE_EXCL)) { 2102 goto fail_promote; 2103 } 2104 2105 /* 2106 * We need to be careful not to deadlock 2107 * with another thread in page_lookup(). 2108 * The page_lookup() thread could be holding 2109 * the same phm that we need if the two 2110 * pages happen to hash to the same phm lock. 2111 * At this point we have locked the entire 2112 * freelist and page_lookup() could be trying 2113 * to grab a freelist lock. 2114 */ 2115 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 2116 phm = PAGE_HASH_MUTEX(index); 2117 if (!mutex_tryenter(phm)) { 2118 page_unlock_nocapture(pp); 2119 goto fail_promote; 2120 } 2121 2122 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 2123 page_hashout(pp, phm); 2124 mutex_exit(phm); 2125 PP_SETAGED(pp); 2126 page_unlock_nocapture(pp); 2127 which_list = PG_CACHE_LIST; 2128 } 2129 page_ctr_sub(mnode, mtype, pp, which_list); 2130 2131 /* 2132 * Concatenate the smaller page(s) onto 2133 * the large page list.
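 * The page being absorbed may itself be a free large page smaller than
 * new_szc, in which case all of its constituents are relabeled in one pass.
 * As a hedged example, folding an already free 64K page (8 base pages of
 * 8K) into a larger region advances pp by 8 and drops pages_left by 8 at
 * once.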
2134 */ 2135 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 2136 pages_left -= npgs; 2137 tpp = pp; 2138 while (npgs--) { 2139 tpp->p_szc = new_szc; 2140 tpp = tpp->p_next; 2141 } 2142 page_list_concat(&pplist, &pp); 2143 pp += tmpnpgs; 2144 } 2145 CHK_LPG(pplist, new_szc); 2146 2147 /* 2148 * return the page to the user if requested 2149 * in the properly locked state. 2150 */ 2151 if ((flags & PC_ALLOC) && (page_trylock_cons(pplist, SE_EXCL))) { 2152 return (pplist); 2153 } 2154 2155 /* 2156 * If the PC_KFLT_EXPORT flag is set, kflt_export() is just going to 2157 * return this large page to the user page freelist, so there is no 2158 * need to lock it. 2159 */ 2160 if (flags & PC_KFLT_EXPORT) { 2161 return (pplist); 2162 } 2163 2164 /* 2165 * Otherwise place the new large page on the freelist 2166 */ 2167 bin = PP_2_BIN(pplist); 2168 mnode = PP_2_MEM_NODE(pplist); 2169 mtype = PP_2_MTYPE(pplist); 2170 page_vpadd(PAGE_FREELISTP(PFLT_USER, mnode, new_szc, 2171 bin, mtype), pplist); 2172 2173 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 2174 return (NULL); 2175 2176 fail_promote: 2177 /* 2178 * A thread must have still been freeing or 2179 * reclaiming the page on the cachelist. 2180 * To prevent a deadlock undo what we have 2181 * done sofar and return failure. This 2182 * situation can only happen while promoting 2183 * PAGESIZE pages. 2184 */ 2185 page_promote_err++; 2186 while (pplist) { 2187 pp = pplist; 2188 mach_page_sub(&pplist, pp); 2189 pp->p_szc = 0; 2190 bin = PP_2_BIN(pp); 2191 mtype = PP_2_MTYPE(pp); 2192 ASSERT(!PP_ISKFLT(pp)); 2193 mach_page_add(PAGE_FREELISTP(PFLT_USER, mnode, 2194 0, bin, mtype), pp); 2195 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2196 } 2197 return (NULL); 2198 2199 } 2200 2201 /* 2202 * Break up a large page into smaller size pages. 2203 * Pages involved are on the freelist before the call and may 2204 * be returned to the caller if requested, otherwise they will 2205 * be placed back on the freelist. 2206 * The caller is responsible for locking the freelist as well as any other 2207 * accounting which needs to be done for a returned page. 2208 * If flags is not PC_ALLOC, the color argument is ignored, and thus 2209 * technically, any value may be passed in but PC_NO_COLOR is the standard 2210 * which should be followed for clarity's sake. 2211 * Returns a page whose pfn is < pfnmax 2212 */ 2213 page_t * 2214 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc, 2215 uchar_t new_szc, int color, int flags) 2216 { 2217 page_t *pp, *pplist, *npplist; 2218 pgcnt_t npgs, n; 2219 uint_t bin; 2220 uint_t mtype; 2221 page_t *ret_pp = NULL; 2222 2223 ASSERT(cur_szc != 0); 2224 ASSERT(new_szc < cur_szc); 2225 2226 pplist = page_numtopp_nolock(pfnum); 2227 ASSERT(pplist != NULL); 2228 2229 ASSERT(pplist->p_szc == cur_szc); 2230 ASSERT(!PP_ISKFLT(pplist)); 2231 2232 bin = PP_2_BIN(pplist); 2233 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 2234 mtype = PP_2_MTYPE(pplist); 2235 page_vpsub(PAGE_FREELISTP(PFLT_USER, mnode, cur_szc, 2236 bin, mtype), pplist); 2237 2238 CHK_LPG(pplist, cur_szc); 2239 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 2240 2241 /* 2242 * Number of PAGESIZE pages for smaller new_szc 2243 * page. 2244 */ 2245 npgs = page_get_pagecnt(new_szc); 2246 2247 while (pplist) { 2248 pp = pplist; 2249 2250 ASSERT(pp->p_szc == cur_szc); 2251 2252 /* 2253 * We either break it up into PAGESIZE pages or larger. 
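 * As a hedged, platform dependent example: demoting a 4M page (512 base
 * pages of 8K) with new_szc covering 64K produces 512 / 8 == 64 sublists
 * via page_list_break(), each relabeled to new_szc and either returned to
 * the caller (at most one) or placed back on the freelist.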
2254 */ 2255 if (npgs == 1) { /* PAGESIZE case */ 2256 mach_page_sub(&pplist, pp); 2257 ASSERT(pp->p_szc == cur_szc); 2258 ASSERT(new_szc == 0); 2259 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2260 pp->p_szc = new_szc; 2261 bin = PP_2_BIN(pp); 2262 if ((bin == color) && (flags == PC_ALLOC) && 2263 (ret_pp == NULL) && (pfnmax == 0 || 2264 pp->p_pagenum < pfnmax) && 2265 page_trylock_cons(pp, SE_EXCL)) { 2266 ret_pp = pp; 2267 } else { 2268 mtype = PP_2_MTYPE(pp); 2269 ASSERT(!PP_ISKFLT(pp)); 2270 mach_page_add(PAGE_FREELISTP(PFLT_USER, mnode, 2271 0, bin, mtype), pp); 2272 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2273 } 2274 } else { 2275 page_t *try_to_return_this_page = NULL; 2276 int count = 0; 2277 2278 /* 2279 * Break down into smaller lists of pages. 2280 */ 2281 page_list_break(&pplist, &npplist, npgs); 2282 2283 pp = pplist; 2284 n = npgs; 2285 while (n--) { 2286 ASSERT(pp->p_szc == cur_szc); 2287 /* 2288 * Check whether all the pages in this list 2289 * fit the request criteria. 2290 */ 2291 if (pfnmax == 0 || pp->p_pagenum < pfnmax) { 2292 count++; 2293 } 2294 pp->p_szc = new_szc; 2295 pp = pp->p_next; 2296 } 2297 2298 if (count == npgs && 2299 (pfnmax == 0 || pp->p_pagenum < pfnmax)) { 2300 try_to_return_this_page = pp; 2301 } 2302 2303 CHK_LPG(pplist, new_szc); 2304 2305 bin = PP_2_BIN(pplist); 2306 if (try_to_return_this_page) 2307 ASSERT(mnode == 2308 PP_2_MEM_NODE(try_to_return_this_page)); 2309 if ((bin == color) && (flags == PC_ALLOC) && 2310 (ret_pp == NULL) && try_to_return_this_page && 2311 page_trylock_cons(try_to_return_this_page, 2312 SE_EXCL)) { 2313 ret_pp = try_to_return_this_page; 2314 } else { 2315 mtype = PP_2_MTYPE(pp); 2316 page_vpadd(PAGE_FREELISTP(PFLT_USER, mnode, 2317 new_szc, bin, mtype), pplist); 2318 2319 page_ctr_add(mnode, mtype, pplist, 2320 PG_FREE_LIST); 2321 } 2322 pplist = npplist; 2323 } 2324 } 2325 return (ret_pp); 2326 } 2327 2328 int mpss_coalesce_disable = 0; 2329 2330 /* 2331 * Coalesce free pages into a page of the given szc and color if possible. 2332 * Return the pointer to the page created, otherwise, return NULL. 2333 * 2334 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 
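 *
 * A typical call, as issued from page_get_mnode_freelist() further below
 * for szc > 0 after an attempt to split a larger page has failed, looks
 * like:
 *
 *	pp = page_freelist_coalesce(mnode, szc, bin, plw.plw_ceq_mask[szc],
 *	    mtype, PFNNULL);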
2335 */ 2336 page_t * 2337 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, 2338 int mtype, pfn_t pfnhi) 2339 { 2340 int r = szc; /* region size */ 2341 int mrange; 2342 uint_t full, bin, color_mask, wrap = 0; 2343 pfn_t pfnum, lo, hi; 2344 size_t len, idx, idx0; 2345 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc); 2346 page_t *ret_pp; 2347 MEM_NODE_ITERATOR_DECL(it); 2348 #if defined(__sparc) 2349 pfn_t pfnum0, nlo, nhi; 2350 #endif 2351 if (mpss_coalesce_disable) { 2352 ASSERT(szc < MMU_PAGE_SIZES); 2353 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]); 2354 return (NULL); 2355 } 2356 2357 ASSERT(szc < mmu_page_sizes); 2358 color_mask = PAGE_GET_PAGECOLORS(szc) - 1; 2359 ASSERT(ceq_mask <= color_mask); 2360 ASSERT(color <= color_mask); 2361 color &= ceq_mask; 2362 2363 /* Prevent page_counters dynamic memory from being freed */ 2364 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2365 2366 mrange = MTYPE_2_MRANGE(mnode, mtype); 2367 ASSERT(mrange < mnode_nranges[mnode]); 2368 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]); 2369 2370 /* get pfn range for mtype */ 2371 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2372 MNODETYPE_2_PFN(mnode, mtype, lo, hi); 2373 hi++; 2374 2375 /* use lower limit if given */ 2376 if (pfnhi != PFNNULL && pfnhi < hi) 2377 hi = pfnhi; 2378 2379 /* round to szcpgcnt boundaries */ 2380 lo = P2ROUNDUP(lo, szcpgcnt); 2381 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 2382 if (lo == (pfn_t)-1) { 2383 rw_exit(&page_ctrs_rwlock[mnode]); 2384 return (NULL); 2385 } 2386 hi = hi & ~(szcpgcnt - 1); 2387 2388 /* set lo to the closest pfn of the right color */ 2389 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) || 2390 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) { 2391 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask, 2392 &it); 2393 } 2394 2395 if (hi <= lo) { 2396 rw_exit(&page_ctrs_rwlock[mnode]); 2397 return (NULL); 2398 } 2399 2400 full = FULL_REGION_CNT(r); 2401 2402 /* calculate the number of page candidates and initial search index */ 2403 bin = color; 2404 idx0 = (size_t)(-1); 2405 do { 2406 pgcnt_t acand; 2407 2408 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand); 2409 if (acand) { 2410 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, 2411 r, bin, mrange); 2412 idx0 = MIN(idx0, idx); 2413 cands += acand; 2414 } 2415 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask); 2416 } while (bin != color); 2417 2418 if (cands == 0) { 2419 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]); 2420 rw_exit(&page_ctrs_rwlock[mnode]); 2421 return (NULL); 2422 } 2423 2424 pfnum = IDX_TO_PNUM(mnode, r, idx0); 2425 if (pfnum < lo || pfnum >= hi) { 2426 pfnum = lo; 2427 } else { 2428 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2429 if (pfnum == (pfn_t)-1) { 2430 pfnum = lo; 2431 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2432 ASSERT(pfnum != (pfn_t)-1); 2433 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask || 2434 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) { 2435 /* invalid color, get the closest correct pfn */ 2436 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2437 color_mask, &it); 2438 if (pfnum >= hi) { 2439 pfnum = lo; 2440 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2441 } 2442 } 2443 } 2444 2445 /* set starting index */ 2446 idx0 = PNUM_TO_IDX(mnode, r, pfnum); 2447 ASSERT(idx0 < len); 2448 2449 #if defined(__sparc) 2450 pfnum0 = pfnum; /* page corresponding to idx0 */ 2451 nhi = 0; /* search kcage ranges */ 2452 #endif 2453 2454 for (idx = idx0; wrap == 0 || (idx < 
idx0 && wrap < 2); ) { 2455 2456 #if defined(__sparc) 2457 /* 2458 * Find lowest intersection of kcage ranges and mnode. 2459 * MTYPE_NORELOC means look in the cage, otherwise outside. 2460 */ 2461 if (nhi <= pfnum) { 2462 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum, 2463 (wrap == 0 ? hi : pfnum0), &nlo, &nhi)) 2464 goto wrapit; 2465 2466 /* jump to the next page in the range */ 2467 if (pfnum < nlo) { 2468 pfnum = P2ROUNDUP(nlo, szcpgcnt); 2469 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2470 idx = PNUM_TO_IDX(mnode, r, pfnum); 2471 if (idx >= len || pfnum >= hi) 2472 goto wrapit; 2473 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & 2474 ceq_mask) 2475 goto next; 2476 if (interleaved_mnodes && 2477 PFN_2_MEM_NODE(pfnum) != mnode) 2478 goto next; 2479 } 2480 } 2481 #endif 2482 2483 if (PAGE_COUNTERS(mnode, r, idx) != full) 2484 goto next; 2485 2486 /* 2487 * RFE: For performance maybe we can do something less 2488 * brutal than locking the entire freelist. So far 2489 * this doesn't seem to be a performance problem? 2490 */ 2491 page_freelist_lock(mnode); 2492 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2493 ret_pp = 2494 page_promote(mnode, pfnum, r, PC_ALLOC, mtype); 2495 if (ret_pp != NULL) { 2496 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]); 2497 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, 2498 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx; 2499 page_freelist_unlock(mnode); 2500 rw_exit(&page_ctrs_rwlock[mnode]); 2501 #if defined(__sparc) 2502 if (PP_ISNORELOC(ret_pp)) { 2503 pgcnt_t npgs; 2504 2505 npgs = page_get_pagecnt(ret_pp->p_szc); 2506 kcage_freemem_sub(npgs); 2507 } 2508 #elif defined(__amd64) && !defined(__xpv) 2509 /* 2510 * Only a single page size is supported on 2511 * the kernel freelist. This will need to 2512 * be changed to increase the availability 2513 * of more than one large page size. 2514 */ 2515 ASSERT(!PP_ISKFLT(ret_pp)); 2516 #endif /* __sparc */ 2517 return (ret_pp); 2518 } 2519 #ifdef VM_STATS 2520 } else { 2521 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]); 2522 #endif 2523 } 2524 2525 page_freelist_unlock(mnode); 2526 /* 2527 * No point looking for another page if we've 2528 * already tried all of the ones that 2529 * page_ctr_cands indicated. Stash off where we left 2530 * off. 2531 * Note: this is not exact since we don't hold the 2532 * page_freelist_locks before we initially get the 2533 * value of cands for performance reasons, but should 2534 * be a decent approximation. 2535 */ 2536 if (--cands == 0) { 2537 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) = 2538 idx; 2539 break; 2540 } 2541 next: 2542 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2543 color_mask, &it); 2544 idx = PNUM_TO_IDX(mnode, r, pfnum); 2545 if (idx >= len || pfnum >= hi) { 2546 wrapit: 2547 pfnum = lo; 2548 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2549 idx = PNUM_TO_IDX(mnode, r, pfnum); 2550 wrap++; 2551 #if defined(__sparc) 2552 nhi = 0; /* search kcage ranges */ 2553 #endif 2554 } 2555 } 2556 2557 rw_exit(&page_ctrs_rwlock[mnode]); 2558 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]); 2559 return (NULL); 2560 } 2561 2562 /* 2563 * For the given mnode, promote as many small pages to large pages as possible. 2564 * mnode can be -1, which means do them all 2565 */ 2566 void 2567 page_freelist_coalesce_all(int mnode) 2568 { 2569 int r; /* region size */ 2570 int idx, full; 2571 size_t len; 2572 int doall = interleaved_mnodes || mnode < 0; 2573 int mlo = doall ? 0 : mnode; 2574 int mhi = doall ? 
max_mem_nodes : (mnode + 1); 2575 2576 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2577 2578 if (mpss_coalesce_disable) { 2579 return; 2580 } 2581 2582 /* 2583 * Lock the entire freelist and coalesce what we can. 2584 * 2585 * Always promote to the largest page possible 2586 * first to reduce the number of page promotions. 2587 */ 2588 for (mnode = mlo; mnode < mhi; mnode++) { 2589 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2590 page_freelist_lock(mnode); 2591 } 2592 for (r = mmu_page_sizes - 1; r > 0; r--) { 2593 for (mnode = mlo; mnode < mhi; mnode++) { 2594 pgcnt_t cands = 0; 2595 int mrange, nranges = mnode_nranges[mnode]; 2596 2597 for (mrange = 0; mrange < nranges; mrange++) { 2598 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands); 2599 if (cands != 0) 2600 break; 2601 } 2602 if (cands == 0) { 2603 VM_STAT_ADD(vmm_vmstats. 2604 page_ctrs_cands_skip_all); 2605 continue; 2606 } 2607 2608 full = FULL_REGION_CNT(r); 2609 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2610 2611 for (idx = 0; idx < len; idx++) { 2612 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2613 pfn_t pfnum = 2614 IDX_TO_PNUM(mnode, r, idx); 2615 int tmnode = interleaved_mnodes ? 2616 PFN_2_MEM_NODE(pfnum) : mnode; 2617 2618 ASSERT(pfnum >= 2619 mem_node_config[tmnode].physbase && 2620 pfnum < 2621 mem_node_config[tmnode].physmax); 2622 2623 (void) page_promote(tmnode, 2624 pfnum, r, PC_FREE, PC_MTYPE_ANY); 2625 } 2626 } 2627 /* shared hpm_counters covers all mnodes, so we quit */ 2628 if (interleaved_mnodes) 2629 break; 2630 } 2631 } 2632 for (mnode = mlo; mnode < mhi; mnode++) { 2633 page_freelist_unlock(mnode); 2634 rw_exit(&page_ctrs_rwlock[mnode]); 2635 } 2636 } 2637 2638 /* 2639 * This is where all policies for moving pages around 2640 * to different page size free lists are implemented. 2641 * Returns 1 on success, 0 on failure. 2642 * 2643 * So far these are the priorities for this algorithm in descending 2644 * order: 2645 * 2646 * 1) When servicing a request try to do so with a free page 2647 * from next size up. Helps defer fragmentation as long 2648 * as possible. 2649 * 2650 * 2) Page coalesce on demand. Only when a freelist 2651 * larger than PAGESIZE is empty and step 1 2652 * will not work since all larger size lists are 2653 * also empty. 2654 * 2655 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2656 */ 2657 2658 page_t * 2659 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype, 2660 pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw) 2661 { 2662 uchar_t nszc = szc + 1; 2663 uint_t bin, sbin, bin_prev; 2664 page_t *pp, *firstpp; 2665 page_t *ret_pp = NULL; 2666 uint_t color_mask; 2667 2668 if (nszc == mmu_page_sizes) 2669 return (NULL); 2670 2671 ASSERT(nszc < mmu_page_sizes); 2672 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1; 2673 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color); 2674 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR : 2675 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev); 2676 2677 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]); 2678 /* 2679 * First try to break up a larger page to fill current size freelist. 2680 */ 2681 while (plw->plw_bins[nszc] != 0) { 2682 2683 ASSERT(nszc < mmu_page_sizes); 2684 2685 /* 2686 * If page found then demote it. 2687 */ 2688 if (PAGE_FREELISTS(PFLT_USER, mnode, nszc, bin, mtype)) { 2689 page_freelist_lock(mnode); 2690 firstpp = pp = PAGE_FREELISTS(PFLT_USER, mnode, 2691 nszc, bin, mtype); 2692 2693 /* 2694 * If pfnhi is not PFNNULL, look for large page below 2695 * pfnhi. PFNNULL signifies no pfn requirement.
2696 */ 2697 if (pp && 2698 ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) || 2699 (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) { 2700 do { 2701 pp = pp->p_vpnext; 2702 if (pp == firstpp) { 2703 pp = NULL; 2704 break; 2705 } 2706 } while ((pfnhi != PFNNULL && 2707 pp->p_pagenum >= pfnhi) || 2708 (pfnlo != PFNNULL && 2709 pp->p_pagenum < pfnlo)); 2710 2711 if (pfnhi != PFNNULL && pp != NULL) 2712 ASSERT(pp->p_pagenum < pfnhi); 2713 2714 if (pfnlo != PFNNULL && pp != NULL) 2715 ASSERT(pp->p_pagenum >= pfnlo); 2716 } 2717 if (pp) { 2718 uint_t ccolor = page_correct_color(szc, nszc, 2719 color, bin, plw->plw_ceq_mask[szc]); 2720 2721 ASSERT(pp->p_szc == nszc); 2722 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]); 2723 ret_pp = page_demote(mnode, pp->p_pagenum, 2724 pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC); 2725 if (ret_pp) { 2726 page_freelist_unlock(mnode); 2727 #if defined(__sparc) 2728 if (PP_ISNORELOC(ret_pp)) { 2729 pgcnt_t npgs; 2730 2731 npgs = page_get_pagecnt( 2732 ret_pp->p_szc); 2733 kcage_freemem_sub(npgs); 2734 } 2735 #elif defined(__amd64) && !defined(__xpv) 2736 ASSERT(!PP_ISKFLT(pp)); 2737 #endif /* __sparc */ 2738 return (ret_pp); 2739 } 2740 } 2741 page_freelist_unlock(mnode); 2742 } 2743 2744 /* loop through next size bins */ 2745 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask); 2746 plw->plw_bins[nszc]--; 2747 2748 if (bin == sbin) { 2749 uchar_t nnszc = nszc + 1; 2750 2751 /* we are done with this page size - check next */ 2752 if (plw->plw_bins[nnszc] == 0) 2753 /* we have already checked next size bins */ 2754 break; 2755 2756 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin); 2757 if (bin_prev != INVALID_COLOR) { 2758 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev); 2759 if (!((bin ^ bin_prev) & 2760 plw->plw_ceq_mask[nnszc])) 2761 break; 2762 } 2763 ASSERT(nnszc < mmu_page_sizes); 2764 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1; 2765 nszc = nnszc; 2766 ASSERT(nszc < mmu_page_sizes); 2767 } 2768 } 2769 2770 return (ret_pp); 2771 } 2772 2773 /* 2774 * Helper routine used only by the freelist code to lock 2775 * a page. If the page is a large page then it succeeds in 2776 * locking all the constituent pages or none at all. 2777 * Returns 1 on success, 0 on failure. 2778 */ 2779 static int 2780 page_trylock_cons(page_t *pp, se_t se) 2781 { 2782 page_t *tpp, *first_pp = pp; 2783 2784 /* 2785 * Fail if can't lock first or only page. 2786 */ 2787 if (!page_trylock(pp, se)) { 2788 return (0); 2789 } 2790 2791 /* 2792 * PAGESIZE: common case. 2793 */ 2794 if (pp->p_szc == 0) { 2795 return (1); 2796 } 2797 2798 /* 2799 * Large page case. 2800 */ 2801 tpp = pp->p_next; 2802 while (tpp != pp) { 2803 if (!page_trylock(tpp, se)) { 2804 /* 2805 * On failure unlock what we have locked so far. 2806 * We want to avoid attempting to capture these 2807 * pages as the pcm mutex may be held which could 2808 * lead to a recursive mutex panic. 2809 */ 2810 while (first_pp != tpp) { 2811 page_unlock_nocapture(first_pp); 2812 first_pp = first_pp->p_next; 2813 } 2814 return (0); 2815 } 2816 tpp = tpp->p_next; 2817 } 2818 return (1); 2819 } 2820 2821 /* 2822 * init context for walking page lists 2823 * Called when a page of the given szc is unavailable. Sets markers 2824 * for the beginning of the search to detect when search has 2825 * completed a full cycle. Sets flags for splitting larger pages 2826 * and coalescing smaller pages. Page walking proceeds until a page 2827 * of the desired equivalent color is found.
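 *
 * A worked example (values are illustrative and assume no vac aliasing):
 * with 64 page colors and colorequivszc[szc] == 0x02, plw_ceq_dif is
 * 64 >> (0 + 2) == 16 equivalence groups and plw_ceq_mask[szc] is
 * (16 - 1) << 2 == 0x3c, so two colors are treated as equivalent when they
 * agree in bits 2..5.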
2828 */ 2829 void 2830 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, 2831 int use_ceq, page_list_walker_t *plw) 2832 { 2833 uint_t nszc, ceq_mask, colors; 2834 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0; 2835 2836 ASSERT(szc < mmu_page_sizes); 2837 colors = PAGE_GET_PAGECOLORS(szc); 2838 2839 plw->plw_colors = colors; 2840 plw->plw_color_mask = colors - 1; 2841 plw->plw_bin_marker = plw->plw_bin0 = bin; 2842 plw->plw_bin_split_prev = bin; 2843 plw->plw_bin_step = (szc == 0) ? vac_colors : 1; 2844 2845 /* 2846 * if vac aliasing is possible make sure lower order color 2847 * bits are never ignored 2848 */ 2849 if (vac_colors > 1) 2850 ceq &= 0xf0; 2851 2852 /* 2853 * calculate the number of non-equivalent colors and 2854 * color equivalency mask 2855 */ 2856 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 2857 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors); 2858 ASSERT(plw->plw_ceq_dif > 0); 2859 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf); 2860 2861 if (flags & PG_MATCH_COLOR) { 2862 if (cpu_page_colors < 0) { 2863 /* 2864 * this is a heterogeneous machine with different CPUs 2865 * having different size e$ (not supported for ni2/rock) 2866 */ 2867 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc); 2868 cpucolors = MAX(cpucolors, 1); 2869 ceq_mask = plw->plw_color_mask & (cpucolors - 1); 2870 plw->plw_ceq_mask[szc] = 2871 MIN(ceq_mask, plw->plw_ceq_mask[szc]); 2872 } 2873 plw->plw_ceq_dif = 1; 2874 } 2875 2876 /* we can split pages in the freelist, but not the cachelist */ 2877 if (can_split) { 2878 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0; 2879 2880 /* set next szc color masks and number of free list bins */ 2881 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { 2882 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, 2883 plw->plw_ceq_mask[szc]); 2884 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); 2885 } 2886 plw->plw_ceq_mask[nszc] = INVALID_MASK; 2887 plw->plw_bins[nszc] = 0; 2888 2889 } else { 2890 ASSERT(szc == 0); 2891 plw->plw_do_split = 0; 2892 plw->plw_bins[1] = 0; 2893 plw->plw_ceq_mask[1] = INVALID_MASK; 2894 } 2895 ASSERT(bin < plw->plw_colors); 2896 } 2897 2898 /* 2899 * Walker variables for the kernel freelist are initialized so that all 2900 * kernel page colors are treated as equivalent. This minimizes the amount 2901 * of memory used by the kernel freelist. 2902 */ 2903 /* ARGSUSED */ 2904 void 2905 page_kflt_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, 2906 int use_ceq, page_list_walker_t *plw) 2907 { 2908 /* 2909 * Note that the following values are only valid for pages with 2910 * szc == 0. 2911 */ 2912 ASSERT(szc == 0); 2913 2914 /* The number of colors for kernel pages */ 2915 plw->plw_colors = KFLT_PAGE_COLORS; 2916 plw->plw_color_mask = KFLT_PAGE_COLORS - 1; 2917 2918 /* The marker indicates when all the bins have been processed */ 2919 plw->plw_bin_marker = plw->plw_bin0 = bin; 2920 plw->plw_bin_split_prev = bin; 2921 2922 /* Add plw_bin_step to get the next bin to process */ 2923 plw->plw_bin_step = vac_colors; 2924 2925 /* There is only 1 color group i.e.
all colors are equivalent */ 2926 plw->plw_ceq_dif = 1; 2927 plw->plw_ceq_mask[0] = 0; 2928 plw->plw_do_split = 0; 2929 2930 ASSERT(bin < plw->plw_colors); 2931 } 2932 2933 /* 2934 * set mark to flag where next split should occur 2935 */ 2936 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \ 2937 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \ 2938 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \ 2939 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \ 2940 plw->plw_split_next = \ 2941 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \ 2942 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \ 2943 plw->plw_split_next = \ 2944 INC_MASKED(plw->plw_split_next, \ 2945 neq_mask, plw->plw_color_mask); \ 2946 } \ 2947 } 2948 2949 uint_t 2950 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) 2951 { 2952 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask; 2953 uint_t bin0_nsz, nbin_nsz, nbin0, nbin; 2954 uchar_t nszc = szc + 1; 2955 2956 nbin = ADD_MASKED(bin, 2957 plw->plw_bin_step, neq_mask, plw->plw_color_mask); 2958 2959 if (plw->plw_do_split) { 2960 plw->plw_bin_split_prev = bin; 2961 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw); 2962 plw->plw_do_split = 0; 2963 } 2964 2965 if (szc == 0) { 2966 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) { 2967 if (nbin == plw->plw_bin0 && 2968 (vac_colors == 1 || nbin != plw->plw_bin_marker)) { 2969 nbin = ADD_MASKED(nbin, plw->plw_bin_step, 2970 neq_mask, plw->plw_color_mask); 2971 plw->plw_bin_split_prev = plw->plw_bin0; 2972 } 2973 2974 if (vac_colors > 1 && nbin == plw->plw_bin_marker) { 2975 plw->plw_bin_marker = 2976 nbin = INC_MASKED(nbin, neq_mask, 2977 plw->plw_color_mask); 2978 plw->plw_bin_split_prev = plw->plw_bin0; 2979 /* 2980 * large pages all have the same vac color 2981 * so by now we should be done with next 2982 * size page splitting process 2983 */ 2984 ASSERT(plw->plw_bins[1] == 0); 2985 plw->plw_do_split = 0; 2986 return (nbin); 2987 } 2988 2989 } else { 2990 uint_t bin_jump = (vac_colors == 1) ? 
2991 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP; 2992 2993 bin_jump &= ~(vac_colors - 1); 2994 2995 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask, 2996 plw->plw_color_mask); 2997 2998 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) { 2999 3000 plw->plw_bin_marker = nbin = nbin0; 3001 3002 if (plw->plw_bins[nszc] != 0) { 3003 /* 3004 * check if next page size bin is the 3005 * same as the next page size bin for 3006 * bin0 3007 */ 3008 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, 3009 nbin); 3010 bin0_nsz = PAGE_GET_NSZ_COLOR(szc, 3011 plw->plw_bin0); 3012 3013 if ((bin0_nsz ^ nbin_nsz) & 3014 plw->plw_ceq_mask[nszc]) 3015 plw->plw_do_split = 1; 3016 } 3017 return (nbin); 3018 } 3019 } 3020 } 3021 3022 if (plw->plw_bins[nszc] != 0) { 3023 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); 3024 if (!((plw->plw_split_next ^ nbin_nsz) & 3025 plw->plw_ceq_mask[nszc])) 3026 plw->plw_do_split = 1; 3027 } 3028 3029 return (nbin); 3030 } 3031 3032 page_t * 3033 page_get_mnode_freelist(page_freelist_type_t *fp, int mnode, uint_t bin, 3034 int mtype, uchar_t szc, uint_t flags) 3035 { 3036 kmutex_t *pcm; 3037 page_t *pp, *first_pp; 3038 uint_t sbin; 3039 int plw_initialized; 3040 page_list_walker_t plw; 3041 3042 ASSERT(szc < mmu_page_sizes); 3043 3044 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 3045 3046 MTYPE_START(mnode, mtype, flags); 3047 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3048 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 3049 return (NULL); 3050 } 3051 try_again: 3052 plw_initialized = 0; 3053 plw.plw_ceq_dif = 1; 3054 3055 /* 3056 * Only hold one freelist lock at a time, that way we 3057 * can start anywhere and not have to worry about lock 3058 * ordering. 3059 */ 3060 for (plw.plw_count = 0; 3061 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 3062 sbin = bin; 3063 do { 3064 if (!PAGE_FREELISTS(PC_ISKFLT(fp), mnode, szc, 3065 bin, mtype)) { 3066 goto bin_empty_1; 3067 } 3068 3069 pcm = PC_BIN_MUTEX(PC_ISKFLT(fp), mnode, bin, 3070 PG_FREE_LIST); 3071 mutex_enter(pcm); 3072 pp = PAGE_FREELISTS(PC_ISKFLT(fp), mnode, szc, 3073 bin, mtype); 3074 if (pp == NULL) { 3075 goto bin_empty_0; 3076 } 3077 3078 /* 3079 * These were set before the page 3080 * was put on the free list, 3081 * they must still be set. 3082 */ 3083 ASSERT(PP_ISFREE(pp)); 3084 ASSERT(PP_ISAGED(pp)); 3085 ASSERT(pp->p_vnode == NULL); 3086 ASSERT(pp->p_hash == NULL); 3087 ASSERT(pp->p_offset == (u_offset_t)-1); 3088 ASSERT(pp->p_szc == szc); 3089 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3090 3091 /* 3092 * Walk down the hash chain. 3093 * 8k pages are linked on p_next 3094 * and p_prev fields. Large pages 3095 * are a contiguous group of 3096 * constituent pages linked together 3097 * on their p_next and p_prev fields. 3098 * The large pages are linked together 3099 * on the hash chain using p_vpnext 3100 * p_vpprev of the base constituent 3101 * page of each large page. 
3102 */ 3103 first_pp = pp; 3104 while (!page_trylock_cons(pp, SE_EXCL) || 3105 IS_DUMP_PAGE(pp)) { 3106 if (szc == 0) { 3107 pp = pp->p_next; 3108 } else { 3109 pp = pp->p_vpnext; 3110 } 3111 3112 ASSERT(PP_ISFREE(pp)); 3113 ASSERT(PP_ISAGED(pp)); 3114 ASSERT(pp->p_vnode == NULL); 3115 ASSERT(pp->p_hash == NULL); 3116 ASSERT(pp->p_offset == (u_offset_t)-1); 3117 ASSERT(pp->p_szc == szc); 3118 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3119 3120 if (pp == first_pp) 3121 goto bin_empty_0; 3122 } 3123 3124 ASSERT(pp != NULL); 3125 ASSERT(mtype == PP_2_MTYPE(pp)); 3126 ASSERT(pp->p_szc == szc); 3127 if (szc == 0) { 3128 page_sub(PAGE_FREELISTP(PC_ISKFLT(fp), mnode, 3129 szc, bin, mtype), pp); 3130 } else { 3131 page_vpsub(PAGE_FREELISTP(PC_ISKFLT(fp), mnode, 3132 szc, bin, mtype), pp); 3133 CHK_LPG(pp, szc); 3134 } 3135 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 3136 3137 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0)) 3138 panic("free page is not. pp %p", (void *)pp); 3139 mutex_exit(pcm); 3140 3141 #if defined(__sparc) 3142 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 3143 (flags & PG_NORELOC) == 0); 3144 3145 if (PP_ISNORELOC(pp)) 3146 kcage_freemem_sub(page_get_pagecnt(szc)); 3147 #elif defined(__amd64) && !defined(__xpv) 3148 if (PP_ISKFLT(pp)) { 3149 ASSERT(szc == 0); 3150 kflt_freemem_sub(1); 3151 } 3152 #endif /* __sparc */ 3153 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]); 3154 return (pp); 3155 3156 bin_empty_0: 3157 mutex_exit(pcm); 3158 bin_empty_1: 3159 if (plw_initialized == 0) { 3160 PAGE_LIST_WALK_INIT(fp, szc, flags, bin, 1, 1, 3161 &plw); 3162 plw_initialized = 1; 3163 ASSERT(plw.plw_colors <= 3164 PAGE_GET_PAGECOLORS(szc)); 3165 ASSERT(plw.plw_colors > 0); 3166 ASSERT((plw.plw_colors & 3167 (plw.plw_colors - 1)) == 0); 3168 ASSERT(bin < plw.plw_colors); 3169 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors); 3170 } 3171 /* calculate the next bin with equivalent color */ 3172 bin = ADD_MASKED(bin, plw.plw_bin_step, 3173 plw.plw_ceq_mask[szc], plw.plw_color_mask); 3174 3175 } while (sbin != bin); 3176 3177 /* 3178 * color bins are all empty if color match. Try and 3179 * satisfy the request by breaking up or coalescing 3180 * pages from a different size freelist of the correct 3181 * color that satisfies the ORIGINAL color requested. 3182 * If that fails then try pages of the same size but 3183 * different colors assuming we are not called with 3184 * PG_MATCH_COLOR. 3185 */ 3186 if (plw.plw_do_split && 3187 (pp = page_freelist_split(szc, bin, mnode, 3188 mtype, PFNNULL, PFNNULL, &plw)) != NULL) 3189 return (pp); 3190 3191 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, 3192 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) 3193 return (pp); 3194 3195 if (plw.plw_ceq_dif > 1) 3196 bin = PAGE_LIST_WALK_NEXT(fp, szc, bin, &plw); 3197 } 3198 3199 /* if allowed, cycle through additional mtypes */ 3200 MTYPE_NEXT(mnode, mtype, flags); 3201 if (mtype >= 0) 3202 goto try_again; 3203 3204 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 3205 3206 return (NULL); 3207 } 3208 3209 /* 3210 * Returns the count of free pages for 'pp' with size code 'szc'. 3211 * Note: This function does not return an exact value as the page freelist 3212 * locks are not held and thus the values in the page_counters may be 3213 * changing as we walk through the data. 
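 *
 * The counters are converted to PAGESIZE pages as they are summed: a count
 * at region size r is the number of fully free regions of size r - 1 below
 * it, so each count is shifted left by PNUM_SHIFT(r - 1). As a hedged
 * example, a count of 3 at a level whose next smaller size spans 8 base
 * pages contributes 3 << 3 == 24 free PAGESIZE pages to the total.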
3214 */ 3215 static int 3216 page_freecnt(int mnode, page_t *pp, uchar_t szc) 3217 { 3218 pgcnt_t pgfree; 3219 pgcnt_t cnt; 3220 ssize_t r = szc; /* region size */ 3221 ssize_t idx; 3222 int i; 3223 int full, range; 3224 3225 /* Make sure pagenum passed in is aligned properly */ 3226 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 3227 ASSERT(szc > 0); 3228 3229 /* Prevent page_counters dynamic memory from being freed */ 3230 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 3231 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3232 cnt = PAGE_COUNTERS(mnode, r, idx); 3233 pgfree = cnt << PNUM_SHIFT(r - 1); 3234 range = FULL_REGION_CNT(szc); 3235 3236 /* Check for completely full region */ 3237 if (cnt == range) { 3238 rw_exit(&page_ctrs_rwlock[mnode]); 3239 return (pgfree); 3240 } 3241 3242 while (--r > 0) { 3243 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3244 full = FULL_REGION_CNT(r); 3245 for (i = 0; i < range; i++, idx++) { 3246 cnt = PAGE_COUNTERS(mnode, r, idx); 3247 /* 3248 * If cnt here is full, that means we have already 3249 * accounted for these pages earlier. 3250 */ 3251 if (cnt != full) { 3252 pgfree += (cnt << PNUM_SHIFT(r - 1)); 3253 } 3254 } 3255 range *= full; 3256 } 3257 rw_exit(&page_ctrs_rwlock[mnode]); 3258 return (pgfree); 3259 } 3260 3261 /* 3262 * Called from page_geti_contig_pages to exclusively lock constituent pages 3263 * starting from 'spp' for page size code 'szc'. 3264 * 3265 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 3266 * region needs to be greater than or equal to the threshold. 3267 */ 3268 static int 3269 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 3270 { 3271 pgcnt_t pgcnt = PNUM_SIZE(szc); 3272 pgcnt_t pgfree, i; 3273 page_t *pp; 3274 3275 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 3276 3277 3278 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 3279 goto skipptcpcheck; 3280 /* 3281 * check if there are sufficient free pages available before attempting 3282 * to trylock. Count is approximate as page counters can change. 3283 */ 3284 pgfree = page_freecnt(mnode, spp, szc); 3285 3286 /* attempt to trylock if there are sufficient already free pages */ 3287 if (pgfree < pgcnt/ptcpthreshold) { 3288 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 3289 return (0); 3290 } 3291 3292 skipptcpcheck: 3293 3294 for (i = 0; i < pgcnt; i++) { 3295 pp = &spp[i]; 3296 if (!page_trylock(pp, SE_EXCL)) { 3297 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 3298 while (--i != (pgcnt_t)-1) { 3299 pp = &spp[i]; 3300 ASSERT(PAGE_EXCL(pp)); 3301 page_unlock_nocapture(pp); 3302 } 3303 return (0); 3304 } 3305 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 3306 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 3307 !PP_ISFREE(pp)) { 3308 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 3309 ASSERT(i == 0); 3310 page_unlock_nocapture(pp); 3311 return (0); 3312 } 3313 if (PP_ISNORELOC(pp)) { 3314 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 3315 while (i != (pgcnt_t)-1) { 3316 pp = &spp[i]; 3317 ASSERT(PAGE_EXCL(pp)); 3318 page_unlock_nocapture(pp); 3319 i--; 3320 } 3321 return (0); 3322 } 3323 if (PP_ISKFLT(pp)) { 3324 VM_STAT_ADD(vmm_vmstats.ptcpfailkflt[szc]); 3325 ASSERT(i == 0); 3326 while (i != (pgcnt_t)-1) { 3327 pp = &spp[i]; 3328 ASSERT(PAGE_EXCL(pp)); 3329 page_unlock_nocapture(pp); 3330 i--; 3331 } 3332 return (0); 3333 } 3334 } 3335 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 3336 return (1); 3337 } 3338 3339 /* 3340 * Claim large page pointed to by 'pp'. 
'pp' is the starting set 3341 * of 'szc' constituent pages that had been locked exclusively previously. 3342 * Will attempt to relocate constituent pages in use. 3343 */ 3344 static page_t * 3345 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 3346 { 3347 spgcnt_t pgcnt, npgs, i; 3348 page_t *targpp, *rpp, *hpp; 3349 page_t *replpp = NULL; 3350 page_t *pplist = NULL; 3351 3352 ASSERT(pp != NULL); 3353 3354 pgcnt = page_get_pagecnt(szc); 3355 while (pgcnt) { 3356 ASSERT(PAGE_EXCL(pp)); 3357 ASSERT(!PP_ISNORELOC(pp)); 3358 ASSERT(!PP_ISKFLT(pp)); 3359 if (PP_ISFREE(pp)) { 3360 /* 3361 * If this is a PG_FREE_LIST page then its 3362 * size code can change underneath us due to 3363 * page promotion or demotion. As an optimization 3364 * use page_list_sub_pages() instead of 3365 * page_list_sub(). 3366 */ 3367 if (PP_ISAGED(pp)) { 3368 page_list_sub_pages(pp, szc); 3369 if (pp->p_szc == szc) { 3370 return (pp); 3371 } 3372 ASSERT(pp->p_szc < szc); 3373 npgs = page_get_pagecnt(pp->p_szc); 3374 hpp = pp; 3375 for (i = 0; i < npgs; i++, pp++) { 3376 pp->p_szc = szc; 3377 } 3378 page_list_concat(&pplist, &hpp); 3379 pgcnt -= npgs; 3380 continue; 3381 } 3382 ASSERT(!PP_ISAGED(pp)); 3383 ASSERT(pp->p_szc == 0); 3384 page_list_sub(pp, PG_CACHE_LIST); 3385 page_hashout(pp, NULL); 3386 PP_SETAGED(pp); 3387 pp->p_szc = szc; 3388 page_list_concat(&pplist, &pp); 3389 pp++; 3390 pgcnt--; 3391 continue; 3392 } 3393 npgs = page_get_pagecnt(pp->p_szc); 3394 3395 /* 3396 * page_create_wait freemem accounting done by caller of 3397 * page_get_freelist and not necessary to call it prior to 3398 * calling page_get_replacement_page. 3399 * 3400 * page_get_replacement_page can call page_get_contig_pages 3401 * to acquire a large page (szc > 0); the replacement must be 3402 * smaller than the contig page size to avoid looping or 3403 * szc == 0 and PGI_PGCPSZC0 is set. 3404 */ 3405 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 3406 replpp = page_get_replacement_page(pp, NULL, 0); 3407 if (replpp) { 3408 npgs = page_get_pagecnt(pp->p_szc); 3409 ASSERT(npgs <= pgcnt); 3410 targpp = pp; 3411 } 3412 } 3413 3414 /* 3415 * If replacement is NULL or do_page_relocate fails, fail 3416 * coalescing of pages. 3417 */ 3418 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 3419 &npgs, NULL) != 0)) { 3420 /* 3421 * Unlock un-processed target list 3422 */ 3423 while (pgcnt--) { 3424 ASSERT(PAGE_EXCL(pp)); 3425 page_unlock_nocapture(pp); 3426 pp++; 3427 } 3428 /* 3429 * Free the processed target list.
3430 */ 3431 while (pplist) { 3432 pp = pplist; 3433 page_sub(&pplist, pp); 3434 ASSERT(PAGE_EXCL(pp)); 3435 ASSERT(pp->p_szc == szc); 3436 ASSERT(PP_ISFREE(pp)); 3437 ASSERT(PP_ISAGED(pp)); 3438 pp->p_szc = 0; 3439 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3440 page_unlock_nocapture(pp); 3441 } 3442 3443 if (replpp != NULL) 3444 page_free_replacement_page(replpp); 3445 3446 return (NULL); 3447 } 3448 ASSERT(pp == targpp); 3449 3450 /* LINTED */ 3451 ASSERT(hpp = pp); /* That's right, it's an assignment */ 3452 3453 pp += npgs; 3454 pgcnt -= npgs; 3455 3456 while (npgs--) { 3457 ASSERT(PAGE_EXCL(targpp)); 3458 ASSERT(!PP_ISFREE(targpp)); 3459 ASSERT(!PP_ISNORELOC(targpp)); 3460 ASSERT(!PP_ISKFLT(targpp)); 3461 PP_SETFREE(targpp); 3462 ASSERT(PP_ISAGED(targpp)); 3463 ASSERT(targpp->p_szc < szc || (szc == 0 && 3464 (flags & PGI_PGCPSZC0))); 3465 targpp->p_szc = szc; 3466 targpp = targpp->p_next; 3467 3468 rpp = replpp; 3469 ASSERT(rpp != NULL); 3470 page_sub(&replpp, rpp); 3471 ASSERT(PAGE_EXCL(rpp)); 3472 ASSERT(!PP_ISFREE(rpp)); 3473 page_unlock_nocapture(rpp); 3474 } 3475 ASSERT(targpp == hpp); 3476 ASSERT(replpp == NULL); 3477 page_list_concat(&pplist, &targpp); 3478 } 3479 CHK_LPG(pplist, szc); 3480 return (pplist); 3481 } 3482 3483 /* 3484 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 3485 * of 0 means nothing left after trim. 3486 */ 3487 /* LINTED */ 3488 int 3489 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 3490 { 3491 pfn_t kcagepfn; 3492 int decr; 3493 int rc = 0; 3494 3495 if (PP_ISNORELOC(mseg->pages)) { 3496 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 3497 3498 /* lower part of this mseg inside kernel cage */ 3499 decr = kcage_current_pfn(&kcagepfn); 3500 3501 /* kernel cage may have transitioned past mseg */ 3502 if (kcagepfn >= mseg->pages_base && 3503 kcagepfn < mseg->pages_end) { 3504 ASSERT(decr == 0); 3505 *lo = MAX(kcagepfn, pfnlo); 3506 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3507 rc = 1; 3508 } 3509 } 3510 /* else entire mseg in the cage */ 3511 } else { 3512 if (PP_ISNORELOC(mseg->epages - 1)) { 3513 3514 /* upper part of this mseg inside kernel cage */ 3515 decr = kcage_current_pfn(&kcagepfn); 3516 3517 /* kernel cage may have transitioned past mseg */ 3518 if (kcagepfn >= mseg->pages_base && 3519 kcagepfn < mseg->pages_end) { 3520 ASSERT(decr); 3521 *hi = MIN(kcagepfn, pfnhi); 3522 *lo = MAX(pfnlo, mseg->pages_base); 3523 rc = 1; 3524 } 3525 } else { 3526 /* entire mseg outside of kernel cage */ 3527 *lo = MAX(pfnlo, mseg->pages_base); 3528 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3529 rc = 1; 3530 } 3531 } 3532 return (rc); 3533 } 3534 3535 /* 3536 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 3537 * page with size code 'szc'. Claiming such a page requires acquiring 3538 * exclusive locks on all constituent pages (page_trylock_contig_pages), 3539 * relocating pages in use and concatenating these constituent pages into a 3540 * large page. 3541 * 3542 * The page lists do not have such a large page and page_freelist_split has 3543 * already failed to demote larger pages and/or coalesce smaller free pages. 3544 * 3545 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 3546 * pages with the same color as 'bin'. 3547 * 3548 * 'pfnflag' specifies the subset of the pfn range to search. 
3549 */ 3550 static page_t * 3551 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 3552 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 3553 { 3554 struct memseg *mseg; 3555 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 3556 pfn_t randpfn; 3557 page_t *pp, *randpp, *endpp; 3558 uint_t colors, ceq_mask; 3559 /* LINTED : set but not used in function */ 3560 uint_t color_mask; 3561 pfn_t hi, lo; 3562 uint_t skip; 3563 MEM_NODE_ITERATOR_DECL(it); 3564 #ifdef DEBUG 3565 pgcnt_t szcpgmask = szcpgcnt - 1; 3566 #endif 3567 3568 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 3569 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 3570 3571 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi) { 3572 return (NULL); 3573 } 3574 3575 ASSERT(szc < mmu_page_sizes); 3576 3577 colors = PAGE_GET_PAGECOLORS(szc); 3578 color_mask = colors - 1; 3579 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 3580 uchar_t ceq = colorequivszc[szc]; 3581 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 3582 3583 ASSERT(ceq_dif > 0); 3584 ceq_mask = (ceq_dif - 1) << (ceq & 0xf); 3585 } else { 3586 ceq_mask = 0; 3587 } 3588 3589 ASSERT(bin < colors); 3590 3591 /* clear "non-significant" color bits */ 3592 bin &= ceq_mask; 3593 3594 /* 3595 * trim the pfn range to search based on pfnflag. pfnflag is set 3596 * when there have been previous page_get_contig_page failures to 3597 * limit the search. 3598 * 3599 * The high bit in pfnflag specifies the number of 'slots' in the 3600 * pfn range and the remainder of pfnflag specifies which slot. 3601 * For example, a value of 1010b selects slot 2 (numbered from 0) of 3602 * a pfn range that has been divided into 8 slots. 3603 */ 3604 if (pfnflag > 1) { 3605 int slots = 1 << (highbit(pfnflag) - 1); 3606 int slotid = pfnflag & (slots - 1); 3607 pgcnt_t szcpages; 3608 int slotlen; 3609 3610 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1; 3611 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 3612 slotlen = howmany(szcpages, slots); 3613 /* skip if 'slotid' slot is empty */ 3614 if (slotid * slotlen >= szcpages) { 3615 return (NULL); 3616 } 3617 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 3618 ASSERT(pfnlo < pfnhi); 3619 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 3620 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1; 3621 } 3622 3623 /* 3624 * This routine can be called recursively so we shouldn't 3625 * acquire a reader lock if a write request is pending. This 3626 * could lead to a deadlock with the DR thread. 3627 * 3628 * Returning NULL informs the caller that we could not get 3629 * a contig page with the required characteristics. 3630 */ 3631 3632 if (!memsegs_trylock(0)) 3633 return (NULL); 3634 3635 /* 3636 * loop through memsegs to look for contig page candidates 3637 */ 3638 3639 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 3640 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 3641 /* no overlap */ 3642 continue; 3643 } 3644 3645 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 3646 /* mseg too small */ 3647 continue; 3648 3649 /* 3650 * trim off kernel cage pages from pfn range and check for 3651 * a trimmed pfn range returned that does not span the 3652 * desired large page size.
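 * For example, if the cage currently occupies the bottom of this memseg up
 * to pfn K, trimkcage() narrows the window to [MAX(K, pfnlo),
 * MIN(pfnhi, pages_end - 1)], and the memseg is skipped entirely when that
 * window spans fewer than szcpgcnt pages.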
3653 */ 3654 if (kcage_on) { 3655 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 || 3656 lo >= hi || ((hi - lo) + 1) < szcpgcnt) 3657 continue; 3658 } else { 3659 lo = MAX(pfnlo, mseg->pages_base); 3660 hi = MIN(pfnhi, (mseg->pages_end - 1)); 3661 } 3662 3663 /* round to szcpgcnt boundaries */ 3664 lo = P2ROUNDUP(lo, szcpgcnt); 3665 3666 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3667 hi = P2ALIGN((hi + 1), szcpgcnt) - 1; 3668 3669 if (hi <= lo) 3670 continue; 3671 3672 /* 3673 * set lo to point to the pfn for the desired bin. Large 3674 * page sizes may only have a single page color 3675 */ 3676 skip = szcpgcnt; 3677 if (ceq_mask > 0 || interleaved_mnodes) { 3678 /* set lo to point at appropriate color */ 3679 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) || 3680 (interleaved_mnodes && 3681 PFN_2_MEM_NODE(lo) != mnode)) { 3682 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, 3683 color_mask, &it); 3684 } 3685 if (hi <= lo) 3686 /* mseg cannot satisfy color request */ 3687 continue; 3688 } 3689 3690 /* randomly choose a point between lo and hi to begin search */ 3691 3692 randpfn = (pfn_t)GETTICK(); 3693 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 3694 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it); 3695 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) { 3696 if (randpfn != (pfn_t)-1) { 3697 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, 3698 ceq_mask, color_mask, &it); 3699 } 3700 if (randpfn >= hi) { 3701 randpfn = lo; 3702 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, 3703 &it); 3704 } 3705 } 3706 randpp = mseg->pages + (randpfn - mseg->pages_base); 3707 3708 ASSERT(randpp->p_pagenum == randpfn); 3709 3710 pp = randpp; 3711 endpp = mseg->pages + (hi - mseg->pages_base) + 1; 3712 3713 ASSERT(randpp + szcpgcnt <= endpp); 3714 3715 do { 3716 ASSERT(!(pp->p_pagenum & szcpgmask)); 3717 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0); 3718 3719 /* Skip over pages on the kernel freelist */ 3720 if (PP_ISKFLT(pp)) { 3721 pp += skip; 3722 goto skip_contig; 3723 } 3724 3725 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 3726 /* pages unlocked by page_claim on failure */ 3727 if (page_claim_contig_pages(pp, szc, flags)) { 3728 memsegs_unlock(0); 3729 return (pp); 3730 } 3731 } 3732 3733 if (ceq_mask == 0 && !interleaved_mnodes) { 3734 pp += skip; 3735 } else { 3736 pfn_t pfn = pp->p_pagenum; 3737 3738 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin, 3739 ceq_mask, color_mask, &it); 3740 if (pfn == (pfn_t)-1) { 3741 pp = endpp; 3742 } else { 3743 pp = mseg->pages + 3744 (pfn - mseg->pages_base); 3745 } 3746 } 3747 skip_contig: 3748 if (pp >= endpp) { 3749 /* start from the beginning */ 3750 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3751 pp = mseg->pages + (lo - mseg->pages_base); 3752 ASSERT(pp->p_pagenum == lo); 3753 ASSERT(pp + szcpgcnt <= endpp); 3754 } 3755 } while (pp != randpp); 3756 } 3757 memsegs_unlock(0); 3758 return (NULL); 3759 } 3760 3761 /* 3762 * controlling routine that searches through physical memory in an attempt to 3763 * claim a large page based on the input parameters when one cannot be found 3764 * on the page free lists. 3765 * 3766 * calls page_geti_contig_pages with an initial pfn range from the mnode 3767 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 3768 * that overlap with the kernel cage or do not match the requested page 3769 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 3770 * page_geti_contig_pages may further limit the search range based on 3771 * previous failure counts (pgcpfailcnt[]).
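 *
 * As a hedged example of that limiting: if pgcpfailcnt[szc] is 12 (1100b),
 * page_geti_contig_pages() divides the aligned pfn range into
 * 1 << (highbit(12) - 1) == 8 slots and searches only slot 12 & 7 == 4,
 * i.e. roughly one eighth of the range; each successful allocation halves
 * pgcpfailcnt[szc], widening the next search.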
3772 * 3773 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 3774 * pagesize page that satisfies mtype. 3775 */ 3776 /* ARGSUSED */ 3777 page_t * 3778 page_get_contig_pages(page_freelist_type_t *fp, int mnode, uint_t bin, 3779 int mtype, uchar_t szc, uint_t flags) 3780 { 3781 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3782 page_t *pp; 3783 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3784 3785 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3786 3787 /* no allocations from cage */ 3788 flags |= PGI_NOCAGE; 3789 3790 /* LINTED */ 3791 MTYPE_START(mnode, mtype, flags); 3792 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3793 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3794 return (NULL); 3795 } 3796 3797 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3798 3799 /* do not limit search and ignore color if hi pri */ 3800 3801 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3802 pfnflag = pgcpfailcnt[szc]; 3803 3804 /* remove color match to improve chances */ 3805 3806 if (flags & PGI_PGCPHIPRI || pfnflag) 3807 flags &= ~PG_MATCH_COLOR; 3808 3809 do { 3810 /* get pfn range based on mnode and mtype */ 3811 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3812 ASSERT(pfnhi >= pfnlo); 3813 3814 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3815 pfnlo, pfnhi, pfnflag); 3816 3817 if (pp != NULL) { 3818 pfnflag = pgcpfailcnt[szc]; 3819 if (pfnflag) { 3820 /* double the search size */ 3821 pgcpfailcnt[szc] = pfnflag >> 1; 3822 } 3823 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3824 return (pp); 3825 } 3826 MTYPE_NEXT(mnode, mtype, flags); 3827 } while (mtype >= 0); 3828 3829 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3830 return (NULL); 3831 } 3832 3833 #if defined(__i386) || defined(__amd64) 3834 /* 3835 * Determine the likelihood of finding/coalescing a szc page. 3836 * Return 0 if the likelihood is small otherwise return 1. 3837 * 3838 * For now, be conservative and check only 1g pages and return 0 3839 * if there had been previous coalescing failures and the szc pages 3840 * needed to satisfy request would exhaust most of freemem. 3841 */ 3842 int 3843 page_chk_freelist(uint_t szc) 3844 { 3845 pgcnt_t pgcnt; 3846 3847 if (szc <= 1) 3848 return (1); 3849 3850 pgcnt = page_get_pagecnt(szc); 3851 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) { 3852 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]); 3853 return (0); 3854 } 3855 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]); 3856 return (1); 3857 } 3858 #endif 3859 3860 /* 3861 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3862 * 3863 * Does its own locking and accounting. 3864 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3865 * pages of the proper color even if there are pages of a different color. 3866 * 3867 * Finds a page, removes it, THEN locks it. 3868 */ 3869 3870 /*ARGSUSED*/ 3871 page_t * 3872 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3873 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3874 { 3875 page_t *pp; 3876 3877 PAGE_GET_FREELISTS(pp, vp, off, seg, vaddr, size, flags, lgrp); 3878 return (pp); 3879 } 3880 3881 /* 3882 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 3883 * 3884 * Does its own locking. 3885 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3886 * pages of the proper color even if there are pages of a different color. 3887 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3888 * try to lock one of them. 
If no page can be locked, try the 3889 * next bin. Return NULL if a page cannot be found and locked. 3890 * 3891 * Finds a page, tries to lock it, then removes it. 3892 */ 3893 3894 /*ARGSUSED*/ 3895 page_t * 3896 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3897 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3898 { 3899 page_t *pp; 3900 struct as *as = seg->s_as; 3901 ulong_t bin; 3902 /*LINTED*/ 3903 int mnode; 3904 int mtype; 3905 lgrp_mnode_cookie_t lgrp_cookie; 3906 3907 /* 3908 * If we aren't passed a specific lgroup, or passed a freed lgrp, 3909 * assume we wish to allocate near to the current thread's home. 3910 */ 3911 if (!LGRP_EXISTS(lgrp)) 3912 lgrp = lgrp_home_lgrp(); 3913 3914 if (!kcage_on) { 3915 flags &= ~PG_NORELOC; 3916 flags |= PGI_NOCAGE; 3917 } 3918 3919 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3920 kcage_freemem <= kcage_throttlefree) { 3921 /* 3922 * Reserve kcage_throttlefree pages for critical kernel 3923 * threads. 3924 * 3925 * Everybody else has to go to page_create_get_something() 3926 * to get a cage page, so we don't deadlock cageout. 3927 */ 3928 return (NULL); 3929 } 3930 3931 /* LINTED */ 3932 AS_2_BIN(PFLT_USER, as, seg, vp, vaddr, bin, 0); 3933 3934 ASSERT(bin < PAGE_GET_PAGECOLORS(0)); 3935 3936 /* LINTED */ 3937 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE); 3938 3939 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3940 3941 /* 3942 * Try local cachelists first 3943 */ 3944 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3945 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3946 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3947 if (pp != NULL) { 3948 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3949 DTRACE_PROBE4(page__get, 3950 lgrp_t *, lgrp, 3951 int, mnode, 3952 ulong_t, bin, 3953 uint_t, flags); 3954 return (pp); 3955 } 3956 } 3957 3958 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3959 3960 /* 3961 * Try freelists/cachelists that are farther away 3962 * This is our only chance to allocate remote pages for PAGESIZE 3963 * requests. 3964 */ 3965 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3966 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3967 pp = page_get_mnode_freelist(ufltp, mnode, bin, mtype, 3968 0, flags); 3969 if (pp != NULL) { 3970 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3971 DTRACE_PROBE4(page__get, 3972 lgrp_t *, lgrp, 3973 int, mnode, 3974 ulong_t, bin, 3975 uint_t, flags); 3976 return (pp); 3977 } 3978 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3979 if (pp != NULL) { 3980 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3981 DTRACE_PROBE4(page__get, 3982 lgrp_t *, lgrp, 3983 int, mnode, 3984 ulong_t, bin, 3985 uint_t, flags); 3986 return (pp); 3987 } 3988 } 3989 3990 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3991 return (NULL); 3992 } 3993 3994 page_t * 3995 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3996 { 3997 kmutex_t *pcm; 3998 page_t *pp, *first_pp; 3999 uint_t sbin; 4000 int plw_initialized; 4001 page_list_walker_t plw; 4002 4003 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 4004 4005 /* LINTED */ 4006 MTYPE_START(mnode, mtype, flags); 4007 if (mtype < 0) { /* mnode does not have memory in mtype range */ 4008 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 4009 return (NULL); 4010 } 4011 4012 try_again: 4013 4014 plw_initialized = 0; 4015 plw.plw_ceq_dif = 1; 4016 4017 /* 4018 * Only hold one cachelist lock at a time, that way we 4019 * can start anywhere and not have to worry about lock 4020 * ordering.
4021 */ 4022 4023 for (plw.plw_count = 0; 4024 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 4025 sbin = bin; 4026 do { 4027 4028 if (!PAGE_CACHELISTS(mnode, bin, mtype)) 4029 goto bin_empty_1; 4030 /* 4031 * The first parameter is irrelevant here as the flags 4032 * parameter to this macro decides which mutex to lock. 4033 * With the PG_CACHE_LIST flag, we lock the cpc_mutex[]. 4034 * 4035 * User pages from the kernel page freelist may be 4036 * on the cachelist. 4037 */ 4038 pcm = PC_BIN_MUTEX(PFLT_USER, mnode, bin, 4039 PG_CACHE_LIST); 4040 mutex_enter(pcm); 4041 pp = PAGE_CACHELISTS(mnode, bin, mtype); 4042 if (pp == NULL) 4043 goto bin_empty_0; 4044 4045 first_pp = pp; 4046 ASSERT(pp->p_vnode); 4047 ASSERT(PP_ISAGED(pp) == 0); 4048 ASSERT(pp->p_szc == 0); 4049 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 4050 while (!page_trylock(pp, SE_EXCL)) { 4051 pp = pp->p_next; 4052 ASSERT(pp->p_szc == 0); 4053 if (pp == first_pp) { 4054 /* 4055 * We have searched the complete list! 4056 * And all of them (might only be one) 4057 * are locked. This can happen since 4058 * these pages can also be found via 4059 * the hash list. When found via the 4060 * hash list, they are locked first, 4061 * then removed. We give up to let the 4062 * other thread run. 4063 */ 4064 pp = NULL; 4065 break; 4066 } 4067 ASSERT(pp->p_vnode); 4068 ASSERT(PP_ISFREE(pp)); 4069 ASSERT(PP_ISAGED(pp) == 0); 4070 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 4071 mnode); 4072 } 4073 4074 if (pp) { 4075 page_t **ppp; 4076 /* 4077 * Found and locked a page. 4078 * Pull it off the list. 4079 */ 4080 ASSERT(mtype == PP_2_MTYPE(pp)); 4081 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 4082 page_sub(ppp, pp); 4083 /* 4084 * Subtract counters before releasing pcm mutex 4085 * to avoid a race with page_freelist_coalesce 4086 * and page_freelist_split. 4087 */ 4088 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 4089 mutex_exit(pcm); 4090 ASSERT(pp->p_vnode); 4091 ASSERT(PP_ISAGED(pp) == 0); 4092 #if defined(__sparc) 4093 ASSERT(!kcage_on || 4094 (flags & PG_NORELOC) == 0 || 4095 PP_ISNORELOC(pp)); 4096 if (PP_ISNORELOC(pp)) { 4097 kcage_freemem_sub(1); 4098 } 4099 #elif defined(__amd64) && !defined(__xpv) 4100 if (PP_ISKFLT(pp)) { 4101 kflt_freemem_sub(1); 4102 } 4103 #endif /* __sparc */ 4104 VM_STAT_ADD(vmm_vmstats. 
pgmc_allocok); 4105 return (pp); 4106 } 4107 bin_empty_0: 4108 mutex_exit(pcm); 4109 bin_empty_1: 4110 if (plw_initialized == 0) { 4111 page_list_walk_init(0, flags, bin, 0, 1, &plw); 4112 plw_initialized = 1; 4113 } 4114 /* calculate the next bin with equivalent color */ 4115 bin = ADD_MASKED(bin, plw.plw_bin_step, 4116 plw.plw_ceq_mask[0], plw.plw_color_mask); 4117 } while (sbin != bin); 4118 4119 if (plw.plw_ceq_dif > 1) 4120 bin = page_list_walk_next_bin(0, bin, &plw); 4121 } 4122 4123 MTYPE_NEXT(mnode, mtype, flags); 4124 if (mtype >= 0) 4125 goto try_again; 4126 4127 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); 4128 return (NULL); 4129 } 4130 4131 #ifdef DEBUG 4132 #define REPL_PAGE_STATS 4133 #endif /* DEBUG */ 4134 4135 #ifdef REPL_PAGE_STATS 4136 struct repl_page_stats { 4137 uint_t ngets; 4138 uint_t ngets_noreloc; 4139 uint_t npgr_noreloc; 4140 uint_t nnopage_first; 4141 uint_t nnopage; 4142 uint_t nhashout; 4143 uint_t nnofree; 4144 uint_t nnext_pp; 4145 } repl_page_stats; 4146 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) 4147 #else /* REPL_PAGE_STATS */ 4148 #define REPL_STAT_INCR(v) 4149 #endif /* REPL_PAGE_STATS */ 4150 4151 int pgrppgcp; 4152 4153 /* 4154 * The freemem accounting must be done by the caller. 4155 * First we try to get a replacement page of the same size as like_pp, 4156 * if that is not possible, then we just get a set of discontiguous 4157 * PAGESIZE pages. 4158 */ 4159 page_t * 4160 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, 4161 uint_t pgrflags) 4162 { 4163 page_t *like_pp; 4164 page_t *pp, *pplist; 4165 page_t *pl = NULL; 4166 ulong_t bin; 4167 int mnode, page_mnode; 4168 int szc; 4169 spgcnt_t npgs, pg_cnt; 4170 pfn_t pfnum; 4171 int mtype; 4172 int flags = 0; 4173 lgrp_mnode_cookie_t lgrp_cookie; 4174 lgrp_t *lgrp; 4175 4176 REPL_STAT_INCR(ngets); 4177 like_pp = orig_like_pp; 4178 ASSERT(PAGE_EXCL(like_pp)); 4179 4180 szc = like_pp->p_szc; 4181 npgs = page_get_pagecnt(szc); 4182 /* 4183 * Now we reset like_pp to the base page_t. 4184 * That way, we won't walk past the end of this 'szc' page. 4185 */ 4186 pfnum = PFN_BASE(like_pp->p_pagenum, szc); 4187 like_pp = page_numtopp_nolock(pfnum); 4188 ASSERT(like_pp->p_szc == szc); 4189 4190 if (PP_ISNORELOC(like_pp)) { 4191 ASSERT(kcage_on); 4192 REPL_STAT_INCR(ngets_noreloc); 4193 flags = PGI_RELOCONLY; 4194 } else if (pgrflags & PGR_NORELOC) { 4195 ASSERT(kcage_on); 4196 REPL_STAT_INCR(npgr_noreloc); 4197 flags = PG_NORELOC; 4198 } 4199 4200 /* 4201 * Kernel pages must always be replaced with the same size 4202 * pages, since we cannot properly handle demotion of kernel 4203 * pages. 4204 */ 4205 if (PP_ISKAS(like_pp)) 4206 pgrflags |= PGR_SAMESZC; 4207 4208 /* LINTED */ 4209 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs); 4210 4211 while (npgs) { 4212 pplist = NULL; 4213 for (;;) { 4214 pg_cnt = page_get_pagecnt(szc); 4215 bin = PP_2_BIN(like_pp); 4216 ASSERT(like_pp->p_szc == orig_like_pp->p_szc); 4217 ASSERT(pg_cnt <= npgs); 4218 4219 /* 4220 * If an lgroup was specified, try to get the 4221 * page from that lgroup. 4222 * NOTE: Must be careful with code below because 4223 * lgroup may disappear and reappear since there 4224 * is no locking for lgroup here. 4225 */ 4226 if (LGRP_EXISTS(lgrp_target)) { 4227 /* 4228 * Keep local variable for lgroup separate 4229 * from lgroup argument since this code should 4230 * only be exercised when lgroup argument 4231 * exists.... 
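 *
 * The earlier reset of like_pp to its base page relies on simple pfn
 * alignment.  A minimal sketch of that computation, assuming a
 * hypothetical page_cnt_for(szc) helper that returns the (power-of-two)
 * number of PAGESIZE pages in an szc page:
 *
 *    static pfn_t
 *    pfn_base_of(pfn_t pfn, uint_t szc)
 *    {
 *        pgcnt_t cnt = page_cnt_for(szc);
 *
 *        return (pfn & ~((pfn_t)cnt - 1));
 *    }
 *
 * so that any constituent pfn maps back to the first pfn of its
 * large page.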
4232 */ 4233 lgrp = lgrp_target; 4234 4235 /* Try the lgroup's freelists first */ 4236 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4237 LGRP_SRCH_LOCAL); 4238 while ((pplist == NULL) && 4239 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 4240 != -1) { 4241 pplist = 4242 page_get_mnode_freelist(ufltp, 4243 mnode, bin, mtype, szc, flags); 4244 } 4245 4246 /* 4247 * Now try it's cachelists if this is a 4248 * small page. Don't need to do it for 4249 * larger ones since page_freelist_coalesce() 4250 * already failed. 4251 */ 4252 if (pplist != NULL || szc != 0) 4253 break; 4254 4255 /* Now try it's cachelists */ 4256 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4257 LGRP_SRCH_LOCAL); 4258 4259 while ((pplist == NULL) && 4260 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 4261 != -1) { 4262 pplist = 4263 page_get_mnode_cachelist(bin, flags, 4264 mnode, mtype); 4265 } 4266 if (pplist != NULL) { 4267 page_hashout(pplist, NULL); 4268 PP_SETAGED(pplist); 4269 REPL_STAT_INCR(nhashout); 4270 break; 4271 } 4272 /* Done looking in this lgroup. Bail out. */ 4273 break; 4274 } 4275 4276 /* 4277 * No lgroup was specified (or lgroup was removed by 4278 * DR, so just try to get the page as close to 4279 * like_pp's mnode as possible. 4280 * First try the local freelist... 4281 */ 4282 mnode = PP_2_MEM_NODE(like_pp); 4283 pplist = page_get_mnode_freelist(ufltp, mnode, bin, 4284 mtype, szc, flags); 4285 if (pplist != NULL) 4286 break; 4287 4288 REPL_STAT_INCR(nnofree); 4289 4290 /* 4291 * ...then the local cachelist. Don't need to do it for 4292 * larger pages cause page_freelist_coalesce() already 4293 * failed there anyway. 4294 */ 4295 if (szc == 0) { 4296 pplist = page_get_mnode_cachelist(bin, flags, 4297 mnode, mtype); 4298 if (pplist != NULL) { 4299 page_hashout(pplist, NULL); 4300 PP_SETAGED(pplist); 4301 REPL_STAT_INCR(nhashout); 4302 break; 4303 } 4304 } 4305 4306 /* Now try remote freelists */ 4307 page_mnode = mnode; 4308 lgrp = 4309 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode)); 4310 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4311 LGRP_SRCH_HIER); 4312 while (pplist == NULL && 4313 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 4314 != -1) { 4315 /* 4316 * Skip local mnode. 4317 */ 4318 if ((mnode == page_mnode) || 4319 (mem_node_config[mnode].exists == 0)) 4320 continue; 4321 4322 pplist = page_get_mnode_freelist(ufltp, mnode, 4323 bin, mtype, szc, flags); 4324 } 4325 4326 if (pplist != NULL) 4327 break; 4328 4329 /* Now try remote cachelists */ 4330 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4331 LGRP_SRCH_HIER); 4332 while (pplist == NULL && szc == 0) { 4333 mnode = lgrp_memnode_choose(&lgrp_cookie); 4334 if (mnode == -1) 4335 break; 4336 /* 4337 * Skip local mnode. 4338 */ 4339 if ((mnode == page_mnode) || 4340 (mem_node_config[mnode].exists == 0)) 4341 continue; 4342 4343 pplist = page_get_mnode_cachelist(bin, 4344 flags, mnode, mtype); 4345 4346 if (pplist != NULL) { 4347 page_hashout(pplist, NULL); 4348 PP_SETAGED(pplist); 4349 REPL_STAT_INCR(nhashout); 4350 break; 4351 } 4352 } 4353 4354 /* 4355 * Break out of while loop under the following cases: 4356 * - If we successfully got a page. 4357 * - If pgrflags specified only returning a specific 4358 * page size and we could not find that page size. 4359 * - If we could not satisfy the request with PAGESIZE 4360 * or larger pages. 
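 *
 * The overall search order implemented above can be summarized by the
 * following sketch (local_freelist(), local_cachelist(),
 * remote_freelists() and remote_cachelists() are hypothetical helpers
 * standing in for the lgroup/mnode loops):
 *
 *    page_t *
 *    replacement_search(int local_mnode, uchar_t szc)
 *    {
 *        page_t *pp;
 *
 *        if ((pp = local_freelist(local_mnode, szc)) != NULL)
 *            return (pp);
 *        if (szc == 0 && (pp = local_cachelist(local_mnode)) != NULL)
 *            return (pp);
 *        if ((pp = remote_freelists(local_mnode, szc)) != NULL)
 *            return (pp);
 *        if (szc == 0 && (pp = remote_cachelists(local_mnode)) != NULL)
 *            return (pp);
 *        return (NULL);
 *    }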
4361  */
4362             if (pplist != NULL || szc == 0)
4363                 break;
4364 
4365             if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4366                 /* try to find contig page */
4367 
4368                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4369                     LGRP_SRCH_HIER);
4370 
4371                 while ((pplist == NULL) &&
4372                     (mnode =
4373                     lgrp_memnode_choose(&lgrp_cookie))
4374                     != -1) {
4375                     pplist = page_get_contig_pages(
4376                         ufltp, mnode, bin, mtype, szc,
4377                         flags | PGI_PGCPHIPRI);
4378                 }
4379                 break;
4380             }
4381 
4382             /*
4383              * The correct thing to do here is try the next
4384              * page size down using szc--. Due to a bug
4385              * with the processing of HAT_RELOAD_SHARE
4386              * where the sfmmu_ttecnt arrays of all
4387              * hats sharing an ISM segment don't get updated,
4388              * using intermediate size pages for relocation
4389              * can lead to continuous page faults.
4390              */
4391             szc = 0;
4392         }
4393 
4394         if (pplist != NULL) {
4395             DTRACE_PROBE4(page__get,
4396                 lgrp_t *, lgrp,
4397                 int, mnode,
4398                 ulong_t, bin,
4399                 uint_t, flags);
4400 
4401             while (pplist != NULL && pg_cnt--) {
4402                 ASSERT(pplist != NULL);
4403                 pp = pplist;
4404                 page_sub(&pplist, pp);
4405                 PP_CLRFREE(pp);
4406                 PP_CLRAGED(pp);
4407                 page_list_concat(&pl, &pp);
4408                 npgs--;
4409                 like_pp = like_pp + 1;
4410                 REPL_STAT_INCR(nnext_pp);
4411             }
4412             ASSERT(pg_cnt == 0);
4413         } else {
4414             break;
4415         }
4416     }
4417 
4418     if (npgs) {
4419         /*
4420          * We were unable to allocate the necessary number
4421          * of pages.
4422          * We need to free up any pl.
4423          */
4424         REPL_STAT_INCR(nnopage);
4425         page_free_replacement_page(pl);
4426         return (NULL);
4427     } else {
4428         return (pl);
4429     }
4430 }
4431 
4432 /*
4433  * demote a free large page to its constituent pages
4434  */
4435 void
4436 page_demote_free_pages(page_t *pp)
4437 {
4438 
4439     int mnode;
4440 
4441     ASSERT(pp != NULL);
4442     ASSERT(PAGE_LOCKED(pp));
4443     ASSERT(PP_ISFREE(pp));
4444     ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4445 
4446     mnode = PP_2_MEM_NODE(pp);
4447     page_freelist_lock(mnode);
4448     if (pp->p_szc != 0) {
4449         (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4450             pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4451     }
4452     page_freelist_unlock(mnode);
4453     ASSERT(pp->p_szc == 0);
4454 }
4455 
4456 /*
4457  * Factor in colorequiv to check additional 'equivalent' bins.
4458  * colorequiv may be set in /etc/system
4459  */
4460 void
4461 page_set_colorequiv_arr(void)
4462 {
4463     if (colorequiv > 1) {
4464         int i;
4465         uint_t sv_a = lowbit(colorequiv) - 1;
4466 
4467         if (sv_a > 15)
4468             sv_a = 15;
4469 
4470         for (i = 0; i < MMU_PAGE_SIZES; i++) {
4471             uint_t colors;
4472             uint_t a = sv_a;
4473 
4474             if ((colors = hw_page_array[i].hp_colors) <= 1) {
4475                 continue;
4476             }
4477             while ((colors >> a) == 0)
4478                 a--;
4479             if ((a << 4) > colorequivszc[i]) {
4480                 colorequivszc[i] = (a << 4);
4481             }
4482         }
4483     }
4484 }
4485 
4486 /*
4487  * The freelist type data structures allow freelist type specific allocation
4488  * and policy routines to be configured. There are two freelist types currently
4489  * defined, one for kernel memory allocation and the other for user memory.
4490  * The page_get_uflt() routine is called by the PAGE_GET_FREELISTS() macro to
4491  * allocate memory from the user freelist type.
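 *
 * Conceptually, a freelist type is a table of policy routines that are
 * tried in order.  A rough sketch with simplified, hypothetical field
 * names (the real page_freelist_type_t is defined in the VM headers):
 *
 *    typedef page_t *(*flt_policy_fn_t)(int mnode, uint_t bin,
 *        int mtype, uchar_t szc, uint_t flags);
 *
 *    typedef struct flt_sketch {
 *        int             fs_npolicies;
 *        flt_policy_fn_t fs_policies[3];
 *    } flt_sketch_t;
 *
 *    static page_t *
 *    flt_alloc(flt_sketch_t *flt, int mnode, uint_t bin, int mtype,
 *        uchar_t szc, uint_t flags)
 *    {
 *        int i;
 *        page_t *pp;
 *
 *        for (i = 0; i < flt->fs_npolicies; i++) {
 *            pp = flt->fs_policies[i](mnode, bin, mtype, szc, flags);
 *            if (pp != NULL)
 *                return (pp);
 *        }
 *        return (NULL);
 *    }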
4492  */
4493 
4494 /* ARGSUSED */
4495 page_t *
4496 page_get_uflt(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t vaddr,
4497     size_t size, uint_t flags, struct lgrp *lgrp)
4498 {
4499     struct as *as = seg->s_as;
4500     ulong_t bin;
4501     uchar_t szc;
4502     int mtype;
4503 
4504     /*
4505      * If we aren't passed a specific lgroup, or passed a freed lgrp
4506      * assume we wish to allocate near the current thread's home.
4507      */
4508     if (!LGRP_EXISTS(lgrp))
4509         lgrp = lgrp_home_lgrp();
4510 
4511     if (kcage_on) {
4512         if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
4513             kcage_freemem < kcage_throttlefree + btop(size) &&
4514             curthread != kcage_cageout_thread) {
4515             /*
4516              * Set a "reserve" of kcage_throttlefree pages for
4517              * PG_PANIC and cageout thread allocations.
4518              *
4519              * Everybody else has to serialize in
4520              * page_create_get_something() to get a cage page, so
4521              * that we don't deadlock cageout!
4522              */
4523             return (NULL);
4524         }
4525     } else {
4526         flags &= ~PG_NORELOC;
4527         flags |= PGI_NOCAGE;
4528     }
4529 
4530     /* LINTED */
4531     MTYPE_INIT(mtype, vp, vaddr, flags, size);
4532 
4533     /*
4534      * Convert size to page size code.
4535      */
4536     if ((szc = page_szc(size)) == (uchar_t)-1)
4537         panic("page_get_uflt: illegal page size request");
4538     ASSERT(szc < mmu_page_sizes);
4539 
4540     VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc][ufltp->pflt_type]);
4541 
4542     /* LINTED */
4543     AS_2_BIN(PFLT_USER, as, seg, vp, vaddr, bin, szc);
4544 
4545     ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
4546 
4547     return (page_get_flist(ufltp, bin, mtype, szc, flags, lgrp));
4548 }
4549 
4550 /*
4551  * This routine is passed a page color and initial mtype, and calls the page
4552  * freelist type policy routines which actually do the allocations, first
4553  * trying the local and then remote lgroups. The policy routines for user
4554  * page allocations are currently configured as follows:
4555  *
4556  * x64 systems support two freelist types, user and kernel.
4557  *
4558  * The user freelist has 3 policy routines:
4559  *
4560  * 1. page_get_mnode_freelist to allocate a page from the user freelists.
4561  * 2. page_user_alloc_kflt to allocate a page from the kernel freelists.
4562  * 3. page_get_contig_pages to search for a large page in physical memory.
4563  *
4564  * The kernel freelist has only 1 policy routine:
4565  *
4566  * 1. page_get_mnode_freelist to allocate a page from the kernel freelists.
4567  *
4568  * Sparc, x32 and Xen systems support only the user freelist type.
4569  *
4570  * The user freelist has 2 policy routines:
4571  *
4572  * 1. page_get_mnode_freelist to allocate a page from the user freelists.
4573  * 2. page_get_contig_pages to search for a large page in physical memory.
4574  *
4575  */
4576 page_t *
4577 page_get_flist(page_freelist_type_t *fltp, uint_t bin, int mtype,
4578     uchar_t szc, uint_t flags, struct lgrp *lgrp)
4579 {
4580     page_t *pp = NULL;
4581     page_t *(*page_get_func)(page_freelist_type_t *,
4582         int, uint_t, int, uchar_t, uint_t);
4583     lgrp_mnode_cookie_t lgrp_cookie;
4584     int i;
4585     int mnode;
4586 
4587     for (i = 0; i < fltp->pflt_num_policies; i++) {
4588         page_get_func = PAGE_GET_FREELISTS_POLICY(fltp, i);
4589 
4590         /*
4591          * When the cage and the kernel freelist are off, chances are
4592          * that page_get_contig_pages() will fail to lock a large
4593          * page chunk, so in that case it is not called by
4594          * default. This can be changed via /etc/system.
4595          *
4596          * page_get_contig_pages() is also called to acquire a base
4597          * pagesize page for page_create_get_something().
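 *
 * The cost of page_get_contig_pages() is also bounded adaptively:
 * SETPGCPFAILCNT() raises pgcpfailcnt[szc] toward PGCPFAILMAX after a
 * failed contig search, while a successful search halves it again,
 * widening the next search.  A rough sketch of that feedback, where
 * fail_cnt and fail_max stand in for pgcpfailcnt[szc] and PGCPFAILMAX:
 *
 *    static pgcnt_t fail_cnt;
 *    static pgcnt_t fail_max;
 *
 *    static void
 *    contig_search_failed(void)
 *    {
 *        if (++fail_cnt >= fail_max)
 *            fail_cnt = fail_max / 2;
 *    }
 *
 *    static void
 *    contig_search_succeeded(void)
 *    {
 *        fail_cnt >>= 1;
 *    }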
4598 */ 4599 if (page_get_func == page_get_contig_pages) { 4600 if ((flags & PG_NORELOC) || 4601 (pg_contig_disable != 0) || 4602 (!kcage_on && !kflt_on && 4603 !pg_lpgcreate_nocage && szc != 0)) { 4604 continue; 4605 #ifdef VM_STATS 4606 } else { 4607 VM_STAT_ADD( 4608 vmm_vmstats. 4609 pgf_allocretry[szc][fltp->pflt_type]); 4610 #endif 4611 } 4612 } 4613 4614 /* 4615 * Try to get a local page first, but try remote if we can't 4616 * get a page of the right color. 4617 */ 4618 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 4619 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 4620 4621 pp = page_get_func(fltp, mnode, bin, mtype, szc, 4622 flags); 4623 if (pp != NULL) { 4624 #ifdef VM_STATS 4625 VM_STAT_ADD( 4626 vmm_vmstats. 4627 pgf_allocok[szc][fltp->pflt_type]); 4628 #endif 4629 DTRACE_PROBE4(page__get__page, 4630 lgrp_t *, lgrp, 4631 int, mnode, 4632 ulong_t, bin, 4633 uint_t, flags); 4634 return (pp); 4635 } 4636 } 4637 ASSERT(pp == NULL); 4638 4639 /* 4640 * for non-PGI_PGCPSZC0 PAGESIZE requests, check cachelist 4641 * before checking remote free lists. Caller expected to call 4642 * page_get_cachelist which will check local cache lists 4643 * and remote free lists. 4644 */ 4645 if (!PC_ISKFLT(fltp) && szc == 0 && 4646 ((flags & PGI_PGCPSZC0) == 0)) { 4647 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 4648 return (NULL); 4649 } 4650 4651 ASSERT(PC_ISKFLT(fltp) || szc > 0 || (flags & PGI_PGCPSZC0)); 4652 4653 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 4654 4655 if (!(flags & PG_LOCAL)) { 4656 /* 4657 * Try to get a non-local freelist page. 4658 */ 4659 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 4660 while ((mnode = 4661 lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 4662 pp = page_get_func(fltp, mnode, bin, mtype, 4663 szc, flags); 4664 if (pp != NULL) { 4665 DTRACE_PROBE4(page__get, 4666 lgrp_t *, lgrp, 4667 int, mnode, 4668 ulong_t, bin, 4669 uint_t, flags); 4670 #ifdef VM_STATS 4671 VM_STAT_ADD(vmm_vmstats. 4672 pgf_allocokrem[szc] 4673 [fltp->pflt_type]); 4674 #endif 4675 return (pp); 4676 } 4677 } 4678 ASSERT(pp == NULL); 4679 } 4680 4681 if (!(flags & PG_LOCAL) && pgcplimitsearch && 4682 page_get_func == page_get_contig_pages) 4683 SETPGCPFAILCNT(szc); 4684 } 4685 4686 #ifdef VM_STATS 4687 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc][fltp->pflt_type]); 4688 #endif 4689 4690 return (NULL); 4691 } 4692 #if defined(__amd64) && !defined(__xpv) 4693 /* 4694 * The page_get_kflt() routine is called by the PAGE_GET_FREELISTS() macro to 4695 * allocate memory from the kernel freelist type. 4696 */ 4697 /* ARGSUSED */ 4698 page_t * 4699 page_get_kflt(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t vaddr, 4700 size_t size, uint_t flags, struct lgrp *lgrp) 4701 { 4702 struct as *as = seg->s_as; 4703 page_t *pp = NULL; 4704 ulong_t bin; 4705 uchar_t szc; 4706 int mtype; 4707 4708 ASSERT(!kcage_on); 4709 ASSERT(kflt_on); 4710 ASSERT((flags & PG_KFLT) == PG_KFLT); 4711 4712 flags &= ~PG_NORELOC; 4713 flags |= PGI_NOCAGE; 4714 4715 if ((flags & PG_PANIC) == 0 && 4716 kflt_freemem < kflt_throttlefree + btop(size) && 4717 curthread != kflt_evict_thread) { 4718 return (NULL); 4719 } 4720 4721 /* LINTED */ 4722 MTYPE_INIT(mtype, vp, vaddr, flags, size); 4723 4724 /* 4725 * If we aren't passed a specific lgroup, or passed a freed lgrp 4726 * assume we wish to allocate near to the current thread's home. 4727 */ 4728 if (!LGRP_EXISTS(lgrp)) 4729 lgrp = lgrp_home_lgrp(); 4730 4731 /* 4732 * Convert size to page size code. 
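 *
 * A minimal sketch of what this size-to-code lookup amounts to, using a
 * hypothetical table of supported page sizes (the real conversion is
 * done by page_szc(), typically against the platform's hw_page_array[]):
 *
 *    static const size_t pagesizes[] = { 0x1000, 0x200000, 0x40000000 };
 *
 *    static int
 *    size_to_szc(size_t size)
 *    {
 *        uint_t i;
 *
 *        for (i = 0; i < sizeof (pagesizes) / sizeof (pagesizes[0]); i++)
 *            if (pagesizes[i] == size)
 *                return ((int)i);
 *        return (-1);
 *    }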
4733  */
4734     if ((szc = page_szc(size)) == (uchar_t)-1)
4735         panic("page_get_kflt: illegal page size request");
4736     ASSERT(szc == 0);
4737     ASSERT(!(flags & PG_LOCAL));
4738 
4739     VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc][kfltp->pflt_type]);
4740 
4741     /* LINTED */
4742     AS_2_BIN(PFLT_KMEM, as, seg, vp, vaddr, bin, szc);
4743 
4744     ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
4745     ASSERT(bin < KFLT_PAGE_COLORS);
4746 
4747 retry:
4748     pp = page_get_flist(kfltp, bin, mtype, szc, flags, lgrp);
4749 
4750     if (pp != NULL) {
4751         return (pp);
4752     }
4753 
4754 #if defined(__amd64)
4755     if (kernel_page_update_flags_x86(&flags)) {
4756         goto retry;
4757     }
4758 #endif
4759     /*
4760      * Import memory from user page freelists.
4761      */
4762 
4763     /* LINTED: constant in conditional context */
4764     AS_2_BIN(PFLT_USER, as, seg, vp, vaddr, bin, KFLT_PAGESIZE);
4765 
4766     ASSERT(bin < PAGE_GET_PAGECOLORS(KFLT_PAGESIZE));
4767 
4768     if ((pp = page_import_kflt(kfltp, bin, mtype, szc,
4769         flags | PGI_NOPGALLOC | PGI_PGCPHIPRI, NULL)) != NULL) {
4770         VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc][kfltp->pflt_type]);
4771         return (pp);
4772     }
4773 
4774     VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc][kfltp->pflt_type]);
4775     return (NULL);
4776 }
4777 
4778 /*
4779  * This is the policy routine used to allocate user memory on the kernel
4780  * freelist.
4781  */
4782 /* ARGSUSED */
4783 page_t *
4784 page_user_alloc_kflt(page_freelist_type_t *fp, int mnode, uint_t bin, int mtype,
4785     uchar_t szc, uint_t flags)
4786 {
4787     page_t *pp;
4788 
4789     if (szc != 0)
4790         return (NULL);
4791 
4792     if (kflt_freemem < kflt_desfree) {
4793         kflt_evict_wakeup();
4794     }
4795     flags &= ~PG_MATCH_COLOR;
4796 
4797     bin = USER_2_KMEM_BIN(bin);
4798 
4799     if ((pp = page_get_mnode_freelist(kfltp, mnode,
4800         bin, mtype, szc, flags)) != NULL) {
4801         VM_STAT_ADD(vmm_vmstats.puak_allocok);
4802         atomic_add_long(&kflt_user_alloc, 1);
4803         PP_SETUSERKFLT(pp);
4804         return (pp);
4805     }
4806 
4807     VM_STAT_ADD(vmm_vmstats.puak_allocfailed);
4808     return (NULL);
4809 }
4810 
4811 /*
4812  * This routine is called in order to allocate a large page from the user page
4813  * freelist and split it into small pages which are then placed on the kernel
4814  * freelist. If it is called from the kflt_expand() routine, the PGI_NOPGALLOC
4815  * flag is set to indicate that all pages should be placed on the freelist;
4816  * otherwise a page of the requested type and color will be returned.
4817  */
4818 /* ARGSUSED */
4819 page_t *
4820 page_import_kflt(page_freelist_type_t *fp, uint_t bin, int mtype,
4821     uchar_t szc, uint_t flags, int *np)
4822 {
4823     page_t *pp, *pplist;
4824     uint_t alloc_szc = KFLT_PAGESIZE;
4825     kmutex_t *pcm;
4826     page_t *ret_pp = NULL;
4827     uint_t req_bin = bin;
4828     int req_mtype = mtype;
4829     int pgcnt = 0;
4830     int pgalloc;
4831     int mnode;
4832     struct lgrp *lgrp;
4833 
4834     ASSERT(szc == 0);
4835 
4836     flags &= ~(PG_LOCAL|PG_MATCH_COLOR);
4837     lgrp = lgrp_home_lgrp();
4838 
4839     pgalloc = ((flags & PGI_NOPGALLOC) == 0);
4840 
4841     /* Allocate a large page from the user pagelist */
4842     if ((pplist = page_get_flist(ufltp, bin, mtype, alloc_szc,
4843         flags, lgrp)) != NULL) {
4844 
4845         VM_STAT_ADD(vmm_vmstats.pgik_allocok);
4846         CHK_LPG(pplist, alloc_szc);
4847         mnode = PP_2_MEM_NODE(pplist);
4848         /*
4849          * Split up the large page and put the constituent pages
4850          * on the kernel freelist.
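 *
 * In outline, the loop below removes each constituent page from the
 * large-page list, clears its size code, and either hands back the
 * first page matching the requested bin/mtype or files the page in the
 * appropriate kernel freelist bin.  A simplified sketch (hypothetical
 * helpers; locking and counter updates omitted):
 *
 *    while ((pp = list_remove_head(&pplist)) != NULL) {
 *        pp->p_szc = 0;
 *        if (pgalloc && ret_pp == NULL &&
 *            page_matches(pp, req_bin, req_mtype))
 *            ret_pp = pp;
 *        else
 *            kflt_bin_insert(pp);
 *    }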
4851 */ 4852 while (pplist) { 4853 pgcnt++; 4854 pp = pplist; 4855 ASSERT(pp->p_szc == alloc_szc); 4856 ASSERT(PP_ISFREE(pp)); 4857 mach_page_sub(&pplist, pp); 4858 4859 pp->p_szc = 0; 4860 PP_SETKFLT(pp); 4861 mtype = PP_2_MTYPE(pp); 4862 bin = PP_2_BIN(pp); 4863 if (pgalloc && (ret_pp == NULL) && 4864 ((bin == req_bin && mtype == req_mtype))) { 4865 ret_pp = pp; 4866 } else { 4867 pcm = PC_BIN_MUTEX(PFLT_KMEM, mnode, bin, 4868 PG_FREE_LIST); 4869 ASSERT(mtype == PP_2_MTYPE(pp)); 4870 mutex_enter(pcm); 4871 mach_page_add(PAGE_FREELISTP(PFLT_KMEM, mnode, 4872 0, bin, mtype), pp); 4873 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 4874 mutex_exit(pcm); 4875 page_unlock(pp); 4876 } 4877 } 4878 4879 if (np != NULL) 4880 *np = pgcnt; 4881 4882 if (ret_pp == NULL) { 4883 kflt_freemem_add(pgcnt); 4884 } else { 4885 kflt_freemem_add(pgcnt - 1); 4886 } 4887 return (ret_pp); 4888 4889 } else { 4890 4891 VM_STAT_ADD(vmm_vmstats.pgik_allocfailed); 4892 return (NULL); 4893 } 4894 } 4895 4896 /* 4897 * This routine is called from the kflt_user_evict() thread when kernel 4898 * memory is low and the thread has not managed to increase it by freeing up 4899 * user pages 4900 */ 4901 void 4902 kflt_expand() 4903 { 4904 ulong_t bin; 4905 int mtype; 4906 uint_t flags; 4907 spgcnt_t wanted; 4908 caddr_t vaddr; 4909 int np; 4910 int lpallocated = 0; 4911 int retries; 4912 4913 ASSERT(kflt_on); 4914 vaddr = 0; 4915 flags = PGI_NOPGALLOC | PGI_PGCPHIPRI; 4916 4917 wanted = MAX(kflt_lotsfree, kflt_throttlefree + kflt_needfree) 4918 - kflt_freemem; 4919 4920 if (wanted <= 0) { 4921 return; 4922 } 4923 4924 /* LINTED */ 4925 MTYPE_INIT(mtype, &kvp, vaddr, flags, KFLT_PAGESIZE); 4926 4927 #if defined(__amd64) 4928 (void) kernel_page_update_flags_x86(&flags); 4929 #endif 4930 /* LINTED */ 4931 AS_2_BIN(PFLT_USER, &kas, NULL, &kvp, vaddr, bin, 1); 4932 4933 retries = 0; 4934 while (kflt_on && wanted > 0) { 4935 (void) page_import_kflt(kfltp, bin, mtype, 0, 4936 flags, &np); 4937 4938 if (np == 0) { 4939 if (lpallocated == 0 && 4940 retries < KFLT_EXPAND_RETRIES) { 4941 retries++; 4942 ASSERT((flags & (PGI_NOPGALLOC | PGI_PGCPHIPRI)) 4943 == (PGI_NOPGALLOC | PGI_PGCPHIPRI)); 4944 continue; 4945 } 4946 break; 4947 } else { 4948 wanted -= np; 4949 lpallocated = 1; 4950 } 4951 4952 } 4953 4954 #ifdef DEBUG 4955 if (lpallocated) { 4956 VM_STAT_ADD(vmm_vmstats.pgkx_allocok); 4957 } else { 4958 VM_STAT_ADD(vmm_vmstats.pgkx_allocfailed); 4959 } 4960 #endif 4961 } 4962 #endif /* __amd64 && !__xpv */ 4963
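
/*
 * A note on kflt_expand() above: the import target is the deficit between
 * the desired kernel freelist level and what is currently free, roughly
 *
 *    wanted = MAX(kflt_lotsfree, kflt_throttlefree + kflt_needfree) -
 *        kflt_freemem;
 *
 * and large-page imports are retried a bounded number of times when an
 * attempt imports nothing.  A simplified sketch of that loop, where
 * deficit() and import_once() are hypothetical stand-ins for the
 * computation above and page_import_kflt(), and RETRY_MAX stands in for
 * KFLT_EXPAND_RETRIES:
 *
 *    spgcnt_t want = deficit();
 *    int retries = 0, progressed = 0;
 *
 *    while (want > 0) {
 *        int got = import_once();
 *
 *        if (got == 0) {
 *            if (!progressed && retries++ < RETRY_MAX)
 *                continue;
 *            break;
 *        }
 *        want -= got;
 *        progressed = 1;
 *    }
 */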