1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
26 /* All Rights Reserved */
27
28 /*
29 * Portions of this source code were derived from Berkeley 4.3 BSD
30 * under license from the Regents of the University of California.
31 */
32
33
34 /*
35 * This file contains common functions to access and manage the page lists.
36 * Many of these routines originated from platform dependent modules
37  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function
38  * in a platform independent manner.
39 *
40 * vm/vm_dep.h provides for platform specific support.
41 */
42
43 #include <sys/types.h>
44 #include <sys/debug.h>
45 #include <sys/cmn_err.h>
46 #include <sys/systm.h>
47 #include <sys/atomic.h>
48 #include <sys/sysmacros.h>
49 #include <vm/as.h>
50 #include <vm/page.h>
51 #include <vm/seg_kmem.h>
52 #include <vm/seg_vn.h>
53 #include <sys/vmsystm.h>
54 #include <sys/memnode.h>
55 #include <vm/vm_dep.h>
56 #include <sys/lgrp.h>
57 #include <sys/mem_config.h>
58 #include <sys/callb.h>
59 #include <sys/mem_cage.h>
60 #include <sys/sdt.h>
61 #include <sys/dumphdr.h>
62 #include <sys/swap.h>
63
64 extern uint_t vac_colors;
65
66 #define MAX_PRAGMA_ALIGN 128
67
68 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
69
70 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
71 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
72 #else
73 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
74 #endif
75 char vm_cpu_data0[VM_CPU_DATA_PADSIZE];
76
77 /*
78  * number of page colors equivalent to the requested color in page_get routines.
79 * If set, keeps large pages intact longer and keeps MPO allocation
80 * from the local mnode in favor of acquiring the 'correct' page color from
81 * a demoted large page or from a remote mnode.
82 */
83 uint_t colorequiv;
84
85 /*
86 * color equivalency mask for each page size.
87 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
88 * High 4 bits determine the number of high order bits of the color to ignore.
89  * Low 4 bits determine the number of low order bits of the color to ignore (it's only
90 * relevant for hashed index based page coloring).
91 */
92 uchar_t colorequivszc[MMU_PAGE_SIZES];
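
/*
 * Illustrative example (hypothetical value, not from any real platform):
 * colorequivszc[szc] = 0x21 tells the page_get routines to ignore the two
 * highest order color bits and the single lowest order color bit when
 * deciding whether a free page's color is equivalent to the requested
 * color for page size code szc.  A value of 0 keeps full color resolution.
 */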
93
94 /*
95 * if set, specifies the percentage of large pages that are free from within
96 * a large page region before attempting to lock those pages for
97 * page_get_contig_pages processing.
98 *
99  * Should be turned on when kpr is available, since page_trylock_contig_pages
100  * can then be more selective.
101 */
102
103 int ptcpthreshold;
104
105 /*
106 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
107 * Enabled by default via pgcplimitsearch.
108 *
109 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
110 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
111 * bound. This upper bound range guarantees:
112 * - all large page 'slots' will be searched over time
113  * - at least one large page candidate is considered on each pgcp call
114 * - count doesn't wrap around to 0
115 */
116 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
117 int pgcplimitsearch = 1;
118
119 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1))
120 #define SETPGCPFAILCNT(szc) \
121 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \
122 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
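
/*
 * Worked example (hypothetical memory size): with physinstalled = 0x180000
 * base pages, highbit(physinstalled) is 21, so PGCPFAILMAX = 1 << 20 =
 * 0x100000, which is at least half of installed memory.  pgcpfailcnt[szc]
 * grows by one on each failure and snaps back to PGCPFAILMAX / 2 once it
 * reaches the bound, so the limited search keeps cycling over all large
 * page 'slots' without the count ever wrapping to 0.
 */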
123
124 #ifdef VM_STATS
125 struct vmm_vmstats_str vmm_vmstats;
126
127 #endif /* VM_STATS */
128
129 #if defined(__sparc)
130 #define LPGCREATE 0
131 #else
132 /* enable page_get_contig_pages */
133 #define LPGCREATE 1
134 #endif
135
136 int pg_contig_disable;
137 int pg_lpgcreate_nocage = LPGCREATE;
138
139 /*
140 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
141 */
142 #define PFNNULL 0
143
144 /* Flags involved in promotion and demotion routines */
145 #define PC_FREE 0x1 /* put page on freelist */
146 #define PC_ALLOC 0x2 /* return page for allocation */
147
148 /*
149 * Flag for page_demote to be used with PC_FREE to denote that we don't care
150 * what the color is as the color parameter to the function is ignored.
151 */
152 #define PC_NO_COLOR (-1)
153
154 /* mtype value for page_promote to use when mtype does not matter */
155 #define PC_MTYPE_ANY (-1)
156
157 /*
158 * page counters candidates info
159 * See page_ctrs_cands comment below for more details.
160 * fields are as follows:
161 * pcc_pages_free: # pages which freelist coalesce can create
162 * pcc_color_free: pointer to page free counts per color
163 */
164 typedef struct pcc_info {
165 pgcnt_t pcc_pages_free;
166 pgcnt_t *pcc_color_free;
167 uint_t pad[12];
168 } pcc_info_t;
169
170 /*
171 * On big machines it can take a long time to check page_counters
172 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
173 * updated sum of all elements of the corresponding page_counters arrays.
174 * page_freelist_coalesce() searches page_counters only if an appropriate
175 * element of page_ctrs_cands array is greater than 0.
176 *
177 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
178 */
179 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
180
181 /*
182 * Return in val the total number of free pages which can be created
183 * for the given mnode (m), mrange (g), and region size (r)
184 */
185 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \
186 int i; \
187 val = 0; \
188 for (i = 0; i < NPC_MUTEX; i++) { \
189 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \
190 } \
191 }
192
193 /*
194 * Return in val the total number of free pages which can be created
195 * for the given mnode (m), mrange (g), region size (r), and color (c)
196 */
197 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \
198 int i; \
199 val = 0; \
200 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \
201 for (i = 0; i < NPC_MUTEX; i++) { \
202 val += \
203 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \
204 } \
205 }
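
/*
 * Sketch of how these macros are meant to be used (page_freelist_coalesce()
 * is the real consumer; see its implementation for the authoritative logic):
 *
 *	pgcnt_t cands;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
 *	if (cands == 0)
 *		return (NULL);
 *
 * Summing across the NPC_MUTEX slots gives a cheap hint without taking any
 * ctr_mutex; a nonzero result only suggests that a coalesce attempt at
 * region size r may be worthwhile.
 */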
206
207 /*
208 * We can only allow a single thread to update a counter within the physical
209 * range of the largest supported page size. That is the finest granularity
210 * possible since the counter values are dependent on each other
211  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
212 * ctr_mutex lock index for a particular physical range.
213 */
214 static kmutex_t *ctr_mutex[NPC_MUTEX];
215
216 #define PP_CTR_LOCK_INDX(pp) \
217 (((pp)->p_pagenum >> \
218 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
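
/*
 * Example (a sketch, assuming a sparc-like configuration whose largest page
 * size is 256MB of 8K base pages, i.e. PAGE_BSZS_SHIFT(mmu_page_sizes - 1)
 * is 15): every page whose pfn falls inside the same 256MB-aligned region
 * shares the value of (p_pagenum >> 15) and therefore hashes to the same
 * ctr_mutex slot, which is what serializes counter updates within one
 * maximal region.
 */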
219
220 #define INVALID_COLOR 0xffffffff
221 #define INVALID_MASK 0xffffffff
222
223 /*
224 * Local functions prototypes.
225 */
226
227 void page_ctr_add(int, int, page_t *, int);
228 void page_ctr_add_internal(int, int, page_t *, int);
229 void page_ctr_sub(int, int, page_t *, int);
230 void page_ctr_sub_internal(int, int, page_t *, int);
231 void page_freelist_lock(int);
232 void page_freelist_unlock(int);
233 page_t *page_promote(int, pfn_t, uchar_t, int, int);
234 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
235 page_t *page_freelist_split(uchar_t,
236 uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
237 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
238 static int page_trylock_cons(page_t *pp, se_t se);
239
240 /*
241 * The page_counters array below is used to keep track of free contiguous
242 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
243 * This contains an array of counters, the size of the array, a shift value
244 * used to convert a pagenum into a counter array index or vice versa, as
245 * well as a cache of the last successful index to be promoted to a larger
246 * page size. As an optimization, we keep track of the last successful index
247 * to be promoted per page color for the given size region, and this is
248 * allocated dynamically based upon the number of colors for a given
249 * region size.
250 *
251 * Conceptually, the page counters are represented as:
252 *
253 * page_counters[region_size][mnode]
254 *
255 * region_size: size code of a candidate larger page made up
256 * of contiguous free smaller pages.
257 *
258 * page_counters[region_size][mnode].hpm_counters[index]:
259 * represents how many (region_size - 1) pages either
260 * exist or can be created within the given index range.
261 *
262 * Let's look at a sparc example:
263 * If we want to create a free 512k page, we look at region_size 2
264 * for the mnode we want. We calculate the index and look at a specific
265 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
266 * this location, it means that 8 64k pages either exist or can be created
267 * from 8K pages in order to make a single free 512k page at the given
268 * index. Note that when a region is full, it will contribute to the
269 * counts in the region above it. Thus we will not know what page
270 * size the free pages will be which can be promoted to this new free
271 * page unless we look at all regions below the current region.
272 */
273
274 /*
275 * Note: hpmctr_t is defined in platform vm_dep.h
276 * hw_page_map_t contains all the information needed for the page_counters
277 * logic. The fields are as follows:
278 *
279 * hpm_counters: dynamically allocated array to hold counter data
280 * hpm_entries: entries in hpm_counters
281 * hpm_shift: shift for pnum/array index conv
282 * hpm_base: PFN mapped to counter index 0
283 * hpm_color_current: last index in counter array for this color at
284 * which we successfully created a large page
285 */
286 typedef struct hw_page_map {
287 hpmctr_t *hpm_counters;
288 size_t hpm_entries;
289 int hpm_shift;
290 pfn_t hpm_base;
291 size_t *hpm_color_current[MAX_MNODE_MRANGES];
292 #if defined(__sparc)
293 uint_t pad[4];
294 #endif
295 } hw_page_map_t;
296
297 /*
298 * Element zero is not used, but is allocated for convenience.
299 */
300 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
301
302 /*
303 * Cached value of MNODE_RANGE_CNT(mnode).
304  * This is a function call on x86.
305 */
306 static int mnode_nranges[MAX_MEM_NODES];
307 static int mnode_maxmrange[MAX_MEM_NODES];
308
309 /*
310 * The following macros are convenient ways to get access to the individual
311 * elements of the page_counters arrays. They can be used on both
312 * the left side and right side of equations.
313 */
314 #define PAGE_COUNTERS(mnode, rg_szc, idx) \
315 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
316
317 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \
318 (page_counters[(rg_szc)][(mnode)].hpm_counters)
319
320 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \
321 (page_counters[(rg_szc)][(mnode)].hpm_shift)
322
323 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \
324 (page_counters[(rg_szc)][(mnode)].hpm_entries)
325
326 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \
327 (page_counters[(rg_szc)][(mnode)].hpm_base)
328
329 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \
330 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
331
332 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \
333 (page_counters[(rg_szc)][(mnode)]. \
334 hpm_color_current[(mrange)][(color)])
335
336 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \
337 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \
338 PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
339
340 #define IDX_TO_PNUM(mnode, rg_szc, index) \
341 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \
342 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
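
/*
 * Worked example (hypothetical values): if hpm_base is pfn 0x80000 and
 * hpm_shift is 3 for region size r (each counter covers 8 base pages),
 * then PNUM_TO_IDX(mnode, r, 0x80017) is (0x17 >> 3) = 2, and
 * IDX_TO_PNUM(mnode, r, 2) gives back the region base pfn 0x80010.
 * page_ctrs_alloc() below asserts that the two macros are inverses of
 * each other in exactly this sense.
 */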
343
344 /*
345 * Protects the hpm_counters and hpm_color_current memory from changing while
346 * looking at page counters information.
347 * Grab the write lock to modify what these fields point at.
348 * Grab the read lock to prevent any pointers from changing.
349  * The write lock can not be held during memory allocation because the
350  * allocation path may recursively try to grab the read lock while the
351  * write lock is already held, which would deadlock.
352 */
353 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
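
/*
 * Typical reader pattern (a sketch only; see the actual callers for the
 * authoritative sequence):
 *
 *	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
 *	... examine PAGE_COUNTERS() / PAGE_COUNTERS_CURRENT_COLOR() ...
 *	rw_exit(&page_ctrs_rwlock[mnode]);
 *
 * page_ctrs_adjust() preallocates its replacement arrays first and only
 * then takes the lock as a writer to swap the pointers, precisely because
 * allocating while holding the write lock could deadlock as described
 * above.
 */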
354
355
356 /*
357 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
358 */
359 void
360 cpu_vm_data_init(struct cpu *cp)
361 {
362 if (cp == CPU0) {
363 cp->cpu_vm_data = (void *)&vm_cpu_data0;
364 } else {
365 void *kmptr;
366 int align;
367 size_t sz;
368
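/*
 * Over-allocate by one alignment unit so the returned buffer can be
 * rounded up to an L2 cache aligned address; remember the raw kmem
 * pointer and total size so cpu_vm_data_destroy() can free the whole
 * allocation.
 */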
369 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
370 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
371 kmptr = kmem_zalloc(sz, KM_SLEEP);
372 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
373 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
374 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
375 }
376 }
377
378 /*
379 * free cpu_vm_data
380 */
381 void
382 cpu_vm_data_destroy(struct cpu *cp)
383 {
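/*
 * The boot cpu (cpu_seqid 0) points at the static vm_cpu_data0 buffer
 * and has nothing to free; every other cpu's vm data was kmem allocated
 * by cpu_vm_data_init().
 */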
384 if (cp->cpu_seqid && cp->cpu_vm_data) {
385 ASSERT(cp != CPU0);
386 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
387 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
388 }
389 cp->cpu_vm_data = NULL;
390 }
391
392
393 /*
394 * page size to page size code
395 */
396 int
397 page_szc(size_t pagesize)
398 {
399 int i = 0;
400
401 while (hw_page_array[i].hp_size) {
402 if (pagesize == hw_page_array[i].hp_size)
403 return (i);
404 i++;
405 }
406 return (-1);
407 }
408
409 /*
410 * page size to page size code with the restriction that it be a supported
411 * user page size. If it's not a supported user page size, -1 will be returned.
412 */
413 int
414 page_szc_user_filtered(size_t pagesize)
415 {
416 int szc = page_szc(pagesize);
417 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
418 return (szc);
419 }
420 return (-1);
421 }
422
423 /*
424 * Return how many page sizes are available for the user to use. This is
425 * what the hardware supports and not based upon how the OS implements the
426 * support of different page sizes.
427 *
428 * If legacy is non-zero, return the number of pagesizes available to legacy
429 * applications. The number of legacy page sizes might be less than the
430 * exported user page sizes. This is to prevent legacy applications that
431  * use the largest page size returned from getpagesizes(3c) from inadvertently
432 * using the 'new' large pagesizes.
433 */
434 uint_t
435 page_num_user_pagesizes(int legacy)
436 {
437 if (legacy)
438 return (mmu_legacy_page_sizes);
439 return (mmu_exported_page_sizes);
440 }
441
442 uint_t
443 page_num_pagesizes(void)
444 {
445 return (mmu_page_sizes);
446 }
447
448 /*
449  * returns the number of base pagesize pages associated with szc
450 */
451 pgcnt_t
452 page_get_pagecnt(uint_t szc)
453 {
454 if (szc >= mmu_page_sizes)
455 panic("page_get_pagecnt: out of range %d", szc);
456 return (hw_page_array[szc].hp_pgcnt);
457 }
458
459 size_t
460 page_get_pagesize(uint_t szc)
461 {
462 if (szc >= mmu_page_sizes)
463 panic("page_get_pagesize: out of range %d", szc);
464 return (hw_page_array[szc].hp_size);
465 }
466
467 /*
468 * Return the size of a page based upon the index passed in. An index of
469 * zero refers to the smallest page size in the system, and as index increases
470 * it refers to the next larger supported page size in the system.
471 * Note that szc and userszc may not be the same due to unsupported szc's on
472 * some systems.
473 */
474 size_t
475 page_get_user_pagesize(uint_t userszc)
476 {
477 uint_t szc = USERSZC_2_SZC(userszc);
478
479 if (szc >= mmu_page_sizes)
480 panic("page_get_user_pagesize: out of range %d", szc);
481 return (hw_page_array[szc].hp_size);
482 }
483
484 uint_t
485 page_get_shift(uint_t szc)
486 {
487 if (szc >= mmu_page_sizes)
488 panic("page_get_shift: out of range %d", szc);
489 return (PAGE_GET_SHIFT(szc));
490 }
491
492 uint_t
493 page_get_pagecolors(uint_t szc)
494 {
495 if (szc >= mmu_page_sizes)
496 panic("page_get_pagecolors: out of range %d", szc);
497 return (PAGE_GET_PAGECOLORS(szc));
498 }
499
500 /*
501 * this assigns the desired equivalent color after a split
502 */
503 uint_t
504 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
505 uint_t ncolor, uint_t ceq_mask)
506 {
507 ASSERT(nszc > szc);
508 ASSERT(szc < mmu_page_sizes);
509 ASSERT(color < PAGE_GET_PAGECOLORS(szc));
510 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
511
512 color &= ceq_mask;
513 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
514 return (color | (ncolor & ~ceq_mask));
515 }
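
/*
 * Example (hypothetical 6-bit color space): when splitting an nszc page
 * down to szc with ceq_mask = 0x07, the three low order bits of the
 * returned color come from the requested color and the remaining bits
 * come from the parent page's converted color, so the result is both
 * equivalent to the request and consistent with the physical location
 * of the split page.
 */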
516
517 /*
518 * The interleaved_mnodes flag is set when mnodes overlap in
519 * the physbase..physmax range, but have disjoint slices.
520 * In this case hpm_counters is shared by all mnodes.
521 * This flag is set dynamically by the platform.
522 */
523 int interleaved_mnodes = 0;
524
525 /*
526 * Called by startup().
527 * Size up the per page size free list counters based on physmax
528 * of each node and max_mem_nodes.
529 *
530 * If interleaved_mnodes is set we need to find the first mnode that
531 * exists. hpm_counters for the first mnode will then be shared by
532 * all other mnodes. If interleaved_mnodes is not set, just set
533 * first=mnode each time. That means there will be no sharing.
534 */
535 size_t
536 page_ctrs_sz(void)
537 {
538 int r; /* region size */
539 int mnode;
540 int firstmn; /* first mnode that exists */
541 int nranges;
542 pfn_t physbase;
543 pfn_t physmax;
544 uint_t ctrs_sz = 0;
545 int i;
546 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
547
548 /*
549 * We need to determine how many page colors there are for each
550 * page size in order to allocate memory for any color specific
551 * arrays.
552 */
553 for (i = 0; i < mmu_page_sizes; i++) {
554 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
555 }
556
557 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
558
559 pgcnt_t r_pgcnt;
560 pfn_t r_base;
561 pgcnt_t r_align;
562
563 if (mem_node_config[mnode].exists == 0)
564 continue;
565
566 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
567 nranges = MNODE_RANGE_CNT(mnode);
568 mnode_nranges[mnode] = nranges;
569 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
570
571 /*
572 * determine size needed for page counter arrays with
573 * base aligned to large page size.
574 */
575 for (r = 1; r < mmu_page_sizes; r++) {
576 /* add in space for hpm_color_current */
577 ctrs_sz += sizeof (size_t) *
578 colors_per_szc[r] * nranges;
579
580 if (firstmn != mnode)
581 continue;
582
583 /* add in space for hpm_counters */
584 r_align = page_get_pagecnt(r);
585 r_base = physbase;
586 r_base &= ~(r_align - 1);
587 r_pgcnt = howmany(physmax - r_base + 1, r_align);
588
589 /*
590 * Round up to always allocate on pointer sized
591 * boundaries.
592 */
593 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
594 sizeof (hpmctr_t *));
595 }
596 }
597
598 for (r = 1; r < mmu_page_sizes; r++) {
599 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
600 }
601
602 /* add in space for page_ctrs_cands and pcc_color_free */
603 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
604 mmu_page_sizes * NPC_MUTEX;
605
606 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
607
608 if (mem_node_config[mnode].exists == 0)
609 continue;
610
611 nranges = mnode_nranges[mnode];
612 ctrs_sz += sizeof (pcc_info_t) * nranges *
613 mmu_page_sizes * NPC_MUTEX;
614 for (r = 1; r < mmu_page_sizes; r++) {
615 ctrs_sz += sizeof (pgcnt_t) * nranges *
616 colors_per_szc[r] * NPC_MUTEX;
617 }
618 }
619
620 /* ctr_mutex */
621 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
622
623 /* size for page list counts */
624 PLCNT_SZ(ctrs_sz);
625
626 /*
627 * add some slop for roundups. page_ctrs_alloc will roundup the start
628 * address of the counters to ecache_alignsize boundary for every
629 * memory node.
630 */
631 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
632 }
633
634 caddr_t
635 page_ctrs_alloc(caddr_t alloc_base)
636 {
637 int mnode;
638 int mrange, nranges;
639 int r; /* region size */
640 int i;
641 int firstmn; /* first mnode that exists */
642 pfn_t physbase;
643 pfn_t physmax;
644 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
645
646 /*
647 * We need to determine how many page colors there are for each
648 * page size in order to allocate memory for any color specific
649 * arrays.
650 */
651 for (i = 0; i < mmu_page_sizes; i++) {
652 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
653 }
654
655 for (r = 1; r < mmu_page_sizes; r++) {
656 page_counters[r] = (hw_page_map_t *)alloc_base;
657 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
658 }
659
660 /* page_ctrs_cands and pcc_color_free array */
661 for (i = 0; i < NPC_MUTEX; i++) {
662 for (r = 1; r < mmu_page_sizes; r++) {
663
664 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
665 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
666
667 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
668 pcc_info_t *pi;
669
670 if (mem_node_config[mnode].exists == 0)
671 continue;
672
673 nranges = mnode_nranges[mnode];
674
675 pi = (pcc_info_t *)alloc_base;
676 alloc_base += sizeof (pcc_info_t) * nranges;
677 page_ctrs_cands[i][r][mnode] = pi;
678
679 for (mrange = 0; mrange < nranges; mrange++) {
680 pi->pcc_color_free =
681 (pgcnt_t *)alloc_base;
682 alloc_base += sizeof (pgcnt_t) *
683 colors_per_szc[r];
684 pi++;
685 }
686 }
687 }
688 }
689
690 /* ctr_mutex */
691 for (i = 0; i < NPC_MUTEX; i++) {
692 ctr_mutex[i] = (kmutex_t *)alloc_base;
693 alloc_base += (max_mem_nodes * sizeof (kmutex_t));
694 }
695
696 /* initialize page list counts */
697 PLCNT_INIT(alloc_base);
698
699 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
700
701 pgcnt_t r_pgcnt;
702 pfn_t r_base;
703 pgcnt_t r_align;
704 int r_shift;
705 int nranges = mnode_nranges[mnode];
706
707 if (mem_node_config[mnode].exists == 0)
708 continue;
709
710 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
711
712 for (r = 1; r < mmu_page_sizes; r++) {
713 /*
714 * the page_counters base has to be aligned to the
715 * page count of page size code r otherwise the counts
716 * will cross large page boundaries.
717 */
718 r_align = page_get_pagecnt(r);
719 r_base = physbase;
720 /* base needs to be aligned - lower to aligned value */
721 r_base &= ~(r_align - 1);
722 r_pgcnt = howmany(physmax - r_base + 1, r_align);
723 r_shift = PAGE_BSZS_SHIFT(r);
724
725 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
726 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
727 PAGE_COUNTERS_BASE(mnode, r) = r_base;
728 for (mrange = 0; mrange < nranges; mrange++) {
729 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
730 r, mrange) = (size_t *)alloc_base;
731 alloc_base += sizeof (size_t) *
732 colors_per_szc[r];
733 }
734 for (i = 0; i < colors_per_szc[r]; i++) {
735 uint_t color_mask = colors_per_szc[r] - 1;
736 pfn_t pfnum = r_base;
737 size_t idx;
738 int mrange;
739 MEM_NODE_ITERATOR_DECL(it);
740
741 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
742 if (pfnum == (pfn_t)-1) {
743 idx = 0;
744 } else {
745 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
746 color_mask, color_mask, &it);
747 idx = PNUM_TO_IDX(mnode, r, pfnum);
748 idx = (idx >= r_pgcnt) ? 0 : idx;
749 }
750 for (mrange = 0; mrange < nranges; mrange++) {
751 PAGE_COUNTERS_CURRENT_COLOR(mnode,
752 r, i, mrange) = idx;
753 }
754 }
755
756 /* hpm_counters may be shared by all mnodes */
757 if (firstmn == mnode) {
758 PAGE_COUNTERS_COUNTERS(mnode, r) =
759 (hpmctr_t *)alloc_base;
760 alloc_base +=
761 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
762 sizeof (hpmctr_t *));
763 } else {
764 PAGE_COUNTERS_COUNTERS(mnode, r) =
765 PAGE_COUNTERS_COUNTERS(firstmn, r);
766 }
767
768 /*
769 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
770 * satisfy the identity requirement.
771 * We should be able to go from one to the other
772 * and get consistent values.
773 */
774 ASSERT(PNUM_TO_IDX(mnode, r,
775 (IDX_TO_PNUM(mnode, r, 0))) == 0);
776 ASSERT(IDX_TO_PNUM(mnode, r,
777 (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
778 }
779 /*
780 * Roundup the start address of the page_counters to
781 * cache aligned boundary for every memory node.
782 * page_ctrs_sz() has added some slop for these roundups.
783 */
784 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
785 L2CACHE_ALIGN);
786 }
787
788 /* Initialize other page counter specific data structures. */
789 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
790 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
791 }
792
793 return (alloc_base);
794 }
795
796 /*
797 * Functions to adjust region counters for each size free list.
798 * Caller is responsible to acquire the ctr_mutex lock if necessary and
799 * thus can be called during startup without locks.
800 */
801 /* ARGSUSED */
802 void
803 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
804 {
805 ssize_t r; /* region size */
806 ssize_t idx;
807 pfn_t pfnum;
808 int lckidx;
809
810 ASSERT(mnode == PP_2_MEM_NODE(pp));
811 ASSERT(mtype == PP_2_MTYPE(pp));
812
813 ASSERT(pp->p_szc < mmu_page_sizes);
814
815 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
816
817 /* no counter update needed for largest page size */
818 if (pp->p_szc >= mmu_page_sizes - 1) {
819 return;
820 }
821
822 r = pp->p_szc + 1;
823 pfnum = pp->p_pagenum;
824 lckidx = PP_CTR_LOCK_INDX(pp);
825
826 /*
827 * Increment the count of free pages for the current
828 * region. Continue looping up in region size incrementing
829  * count if the preceding region is full.
830 */
831 while (r < mmu_page_sizes) {
832 idx = PNUM_TO_IDX(mnode, r, pfnum);
833
834 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
835 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
836
837 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
838 break;
839 } else {
840 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
841 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
842 [MTYPE_2_MRANGE(mnode, root_mtype)];
843
844 cand->pcc_pages_free++;
845 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
846 }
847 r++;
848 }
849 }
850
851 void
852 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
853 {
854 int lckidx = PP_CTR_LOCK_INDX(pp);
855 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
856
857 mutex_enter(lock);
858 page_ctr_add_internal(mnode, mtype, pp, flags);
859 mutex_exit(lock);
860 }
861
862 void
863 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
864 {
865 int lckidx;
866 ssize_t r; /* region size */
867 ssize_t idx;
868 pfn_t pfnum;
869
870 ASSERT(mnode == PP_2_MEM_NODE(pp));
871 ASSERT(mtype == PP_2_MTYPE(pp));
872
873 ASSERT(pp->p_szc < mmu_page_sizes);
874
875 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
876
877 /* no counter update needed for largest page size */
878 if (pp->p_szc >= mmu_page_sizes - 1) {
879 return;
880 }
881
882 r = pp->p_szc + 1;
883 pfnum = pp->p_pagenum;
884 lckidx = PP_CTR_LOCK_INDX(pp);
885
886 /*
887 * Decrement the count of free pages for the current
888 * region. Continue looping up in region size decrementing
889  * count if the preceding region was full.
890 */
891 while (r < mmu_page_sizes) {
892 idx = PNUM_TO_IDX(mnode, r, pfnum);
893
894 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
895 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
896
897 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
898 break;
899 } else {
900 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
901 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
902 [MTYPE_2_MRANGE(mnode, root_mtype)];
903
904 ASSERT(cand->pcc_pages_free != 0);
905 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
906
907 cand->pcc_pages_free--;
908 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
909 }
910 r++;
911 }
912 }
913
914 void
915 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
916 {
917 int lckidx = PP_CTR_LOCK_INDX(pp);
918 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
919
920 mutex_enter(lock);
921 page_ctr_sub_internal(mnode, mtype, pp, flags);
922 mutex_exit(lock);
923 }
924
925 /*
926 * Adjust page counters following a memory attach, since typically the
927 * size of the array needs to change, and the PFN to counter index
928 * mapping needs to change.
929 *
930 * It is possible this mnode did not exist at startup. In that case
931 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
932 * to change (a theoretical possibility on x86), which means pcc_color_free
933 * arrays must be extended.
934 */
935 uint_t
936 page_ctrs_adjust(int mnode)
937 {
938 pgcnt_t npgs;
939 int r; /* region size */
940 int i;
941 size_t pcsz, old_csz;
942 hpmctr_t *new_ctr, *old_ctr;
943 pfn_t oldbase, newbase;
944 pfn_t physbase, physmax;
945 size_t old_npgs;
946 hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
947 size_t size_cache[MMU_PAGE_SIZES];
948 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
949 size_t *old_color_array[MAX_MNODE_MRANGES];
950 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
951 pcc_info_t **cands_cache;
952 pcc_info_t *old_pi, *pi;
953 pgcnt_t *pgcntp;
954 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
955 int cands_cache_nranges;
956 int old_maxmrange, new_maxmrange;
957 int rc = 0;
958 int oldmnode;
959
960 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
961 MMU_PAGE_SIZES, KM_NOSLEEP);
962 if (cands_cache == NULL)
963 return (ENOMEM);
964
965 i = -1;
966 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
967
968 newbase = physbase & ~PC_BASE_ALIGN_MASK;
969 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
970
971 /* prepare to free non-null pointers on the way out */
972 cands_cache_nranges = nranges;
973 bzero(ctr_cache, sizeof (ctr_cache));
974 bzero(color_cache, sizeof (color_cache));
975
976 /*
977 * We need to determine how many page colors there are for each
978 * page size in order to allocate memory for any color specific
979 * arrays.
980 */
981 for (r = 0; r < mmu_page_sizes; r++) {
982 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
983 }
984
985 /*
986 * Preallocate all of the new hpm_counters arrays as we can't
987 * hold the page_ctrs_rwlock as a writer and allocate memory.
988 * If we can't allocate all of the arrays, undo our work so far
989 * and return failure.
990 */
991 for (r = 1; r < mmu_page_sizes; r++) {
992 pcsz = npgs >> PAGE_BSZS_SHIFT(r);
993 size_cache[r] = pcsz;
994 ctr_cache[r] = kmem_zalloc(pcsz *
995 sizeof (hpmctr_t), KM_NOSLEEP);
996 if (ctr_cache[r] == NULL) {
997 rc = ENOMEM;
998 goto cleanup;
999 }
1000 }
1001
1002 /*
1003 * Preallocate all of the new color current arrays as we can't
1004 * hold the page_ctrs_rwlock as a writer and allocate memory.
1005 * If we can't allocate all of the arrays, undo our work so far
1006 * and return failure.
1007 */
1008 for (r = 1; r < mmu_page_sizes; r++) {
1009 for (mrange = 0; mrange < nranges; mrange++) {
1010 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1011 colors_per_szc[r], KM_NOSLEEP);
1012 if (color_cache[r][mrange] == NULL) {
1013 rc = ENOMEM;
1014 goto cleanup;
1015 }
1016 }
1017 }
1018
1019 /*
1020 * Preallocate all of the new pcc_info_t arrays as we can't
1021 * hold the page_ctrs_rwlock as a writer and allocate memory.
1022 * If we can't allocate all of the arrays, undo our work so far
1023 * and return failure.
1024 */
1025 for (r = 1; r < mmu_page_sizes; r++) {
1026 for (i = 0; i < NPC_MUTEX; i++) {
1027 pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1028 KM_NOSLEEP);
1029 if (pi == NULL) {
1030 rc = ENOMEM;
1031 goto cleanup;
1032 }
1033 cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1034
1035 for (mrange = 0; mrange < nranges; mrange++, pi++) {
1036 pgcntp = kmem_zalloc(colors_per_szc[r] *
1037 sizeof (pgcnt_t), KM_NOSLEEP);
1038 if (pgcntp == NULL) {
1039 rc = ENOMEM;
1040 goto cleanup;
1041 }
1042 pi->pcc_color_free = pgcntp;
1043 }
1044 }
1045 }
1046
1047 /*
1048 * Grab the write lock to prevent others from walking these arrays
1049 * while we are modifying them.
1050 */
1051 PAGE_CTRS_WRITE_LOCK(mnode);
1052
1053 /*
1054 * For interleaved mnodes, find the first mnode
1055 * with valid page counters since the current
1056 * mnode may have just been added and not have
1057 * valid page counters.
1058 */
1059 if (interleaved_mnodes) {
1060 for (i = 0; i < max_mem_nodes; i++)
1061 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1062 break;
1063 ASSERT(i < max_mem_nodes);
1064 oldmnode = i;
1065 } else
1066 oldmnode = mnode;
1067
1068 old_nranges = mnode_nranges[mnode];
1069 cands_cache_nranges = old_nranges;
1070 mnode_nranges[mnode] = nranges;
1071 old_maxmrange = mnode_maxmrange[mnode];
1072 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1073 new_maxmrange = mnode_maxmrange[mnode];
1074
1075 for (r = 1; r < mmu_page_sizes; r++) {
1076 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1077 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1078 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1079 oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1080 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1081 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1082 old_color_array[mrange] =
1083 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1084 r, mrange);
1085 }
1086
1087 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1088 new_ctr = ctr_cache[r];
1089 ctr_cache[r] = NULL;
1090 if (old_ctr != NULL &&
1091 (oldbase + old_npgs > newbase) &&
1092 (newbase + npgs > oldbase)) {
1093 /*
1094 * Map the intersection of the old and new
1095 * counters into the new array.
1096 */
1097 size_t offset;
1098 if (newbase > oldbase) {
1099 offset = (newbase - oldbase) >>
1100 PAGE_COUNTERS_SHIFT(mnode, r);
1101 bcopy(old_ctr + offset, new_ctr,
1102 MIN(pcsz, (old_csz - offset)) *
1103 sizeof (hpmctr_t));
1104 } else {
1105 offset = (oldbase - newbase) >>
1106 PAGE_COUNTERS_SHIFT(mnode, r);
1107 bcopy(old_ctr, new_ctr + offset,
1108 MIN(pcsz - offset, old_csz) *
1109 sizeof (hpmctr_t));
1110 }
1111 }
1112
1113 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1114 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1115 PAGE_COUNTERS_BASE(mnode, r) = newbase;
1116
1117 /* update shared hpm_counters in other mnodes */
1118 if (interleaved_mnodes) {
1119 for (i = 0; i < max_mem_nodes; i++) {
1120 if ((i == mnode) ||
1121 (mem_node_config[i].exists == 0))
1122 continue;
1123 ASSERT(
1124 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1125 PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1126 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1127 PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1128 PAGE_COUNTERS_BASE(i, r) = newbase;
1129 }
1130 }
1131
1132 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1133 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1134 color_cache[r][mrange];
1135 color_cache[r][mrange] = NULL;
1136 }
1137 /*
1138 * for now, just reset on these events as it's probably
1139 * not worthwhile to try and optimize this.
1140 */
1141 for (i = 0; i < colors_per_szc[r]; i++) {
1142 uint_t color_mask = colors_per_szc[r] - 1;
1143 int mlo = interleaved_mnodes ? 0 : mnode;
1144 int mhi = interleaved_mnodes ? max_mem_nodes :
1145 (mnode + 1);
1146 int m;
1147 pfn_t pfnum;
1148 size_t idx;
1149 MEM_NODE_ITERATOR_DECL(it);
1150
1151 for (m = mlo; m < mhi; m++) {
1152 if (mem_node_config[m].exists == 0)
1153 continue;
1154 pfnum = newbase;
1155 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1156 if (pfnum == (pfn_t)-1) {
1157 idx = 0;
1158 } else {
1159 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1160 color_mask, color_mask, &it);
1161 idx = PNUM_TO_IDX(m, r, pfnum);
1162 idx = (idx < pcsz) ? idx : 0;
1163 }
1164 for (mrange = 0; mrange < nranges; mrange++) {
1165 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1166 r, mrange) != NULL)
1167 PAGE_COUNTERS_CURRENT_COLOR(m,
1168 r, i, mrange) = idx;
1169 }
1170 }
1171 }
1172
1173 /* cache info for freeing out of the critical path */
1174 if ((caddr_t)old_ctr >= kernelheap &&
1175 (caddr_t)old_ctr < ekernelheap) {
1176 ctr_cache[r] = old_ctr;
1177 size_cache[r] = old_csz;
1178 }
1179 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1180 size_t *tmp = old_color_array[mrange];
1181 if ((caddr_t)tmp >= kernelheap &&
1182 (caddr_t)tmp < ekernelheap) {
1183 color_cache[r][mrange] = tmp;
1184 }
1185 }
1186 /*
1187 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1188 * satisfy the identity requirement.
1189 * We should be able to go from one to the other
1190 * and get consistent values.
1191 */
1192 ASSERT(PNUM_TO_IDX(mnode, r,
1193 (IDX_TO_PNUM(mnode, r, 0))) == 0);
1194 ASSERT(IDX_TO_PNUM(mnode, r,
1195 (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1196
1197 /* pcc_info_t and pcc_color_free */
1198 for (i = 0; i < NPC_MUTEX; i++) {
1199 pcc_info_t *epi;
1200 pcc_info_t *eold_pi;
1201
1202 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1203 old_pi = page_ctrs_cands[i][r][mnode];
1204 page_ctrs_cands[i][r][mnode] = pi;
1205 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1206
1207 /* preserve old pcc_color_free values, if any */
1208 if (old_pi == NULL)
1209 continue;
1210
1211 /*
1212 * when/if x86 does DR, must account for
1213 * possible change in range index when
1214 * preserving pcc_info
1215 */
1216 epi = &pi[nranges];
1217 eold_pi = &old_pi[old_nranges];
1218 if (new_maxmrange > old_maxmrange) {
1219 pi += new_maxmrange - old_maxmrange;
1220 } else if (new_maxmrange < old_maxmrange) {
1221 old_pi += old_maxmrange - new_maxmrange;
1222 }
1223 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1224 pcc_info_t tmp = *pi;
1225 *pi = *old_pi;
1226 *old_pi = tmp;
1227 }
1228 }
1229 }
1230 PAGE_CTRS_WRITE_UNLOCK(mnode);
1231
1232 /*
1233 * Now that we have dropped the write lock, it is safe to free all
1234 * of the memory we have cached above.
1235 * We come thru here to free memory when pre-alloc fails, and also to
1236 * free old pointers which were recorded while locked.
1237 */
1238 cleanup:
1239 for (r = 1; r < mmu_page_sizes; r++) {
1240 if (ctr_cache[r] != NULL) {
1241 kmem_free(ctr_cache[r],
1242 size_cache[r] * sizeof (hpmctr_t));
1243 }
1244 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1245 if (color_cache[r][mrange] != NULL) {
1246 kmem_free(color_cache[r][mrange],
1247 colors_per_szc[r] * sizeof (size_t));
1248 }
1249 }
1250 for (i = 0; i < NPC_MUTEX; i++) {
1251 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1252 if (pi == NULL)
1253 continue;
1254 nr = cands_cache_nranges;
1255 for (mrange = 0; mrange < nr; mrange++, pi++) {
1256 pgcntp = pi->pcc_color_free;
1257 if (pgcntp == NULL)
1258 continue;
1259 if ((caddr_t)pgcntp >= kernelheap &&
1260 (caddr_t)pgcntp < ekernelheap) {
1261 kmem_free(pgcntp,
1262 colors_per_szc[r] *
1263 sizeof (pgcnt_t));
1264 }
1265 }
1266 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1267 if ((caddr_t)pi >= kernelheap &&
1268 (caddr_t)pi < ekernelheap) {
1269 kmem_free(pi, nr * sizeof (pcc_info_t));
1270 }
1271 }
1272 }
1273
1274 kmem_free(cands_cache,
1275 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1276 return (rc);
1277 }
1278
1279 /*
1280 * Cleanup the hpm_counters field in the page counters
1281 * array.
1282 */
1283 void
1284 page_ctrs_cleanup(void)
1285 {
1286 int r; /* region size */
1287 int i; /* mnode index */
1288
1289 /*
1290 * Get the page counters write lock while we are
1291 * setting the page hpm_counters field to NULL
1292 * for non-existent mnodes.
1293 */
1294 for (i = 0; i < max_mem_nodes; i++) {
1295 PAGE_CTRS_WRITE_LOCK(i);
1296 if (mem_node_config[i].exists) {
1297 PAGE_CTRS_WRITE_UNLOCK(i);
1298 continue;
1299 }
1300 for (r = 1; r < mmu_page_sizes; r++) {
1301 PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1302 }
1303 PAGE_CTRS_WRITE_UNLOCK(i);
1304 }
1305 }
1306
1307 #ifdef DEBUG
1308
1309 /*
1310 * confirm pp is a large page corresponding to szc
1311 */
1312 void
1313 chk_lpg(page_t *pp, uchar_t szc)
1314 {
1315 spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1316 uint_t noreloc;
1317
1318 if (npgs == 1) {
1319 ASSERT(pp->p_szc == 0);
1320 ASSERT(pp->p_next == pp);
1321 ASSERT(pp->p_prev == pp);
1322 return;
1323 }
1324
1325 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1326 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1327
1328 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1329 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1330 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1331 ASSERT(pp->p_prev == (pp + (npgs - 1)));
1332
1333 /*
1334 * Check list of pages.
1335 */
1336 noreloc = PP_ISNORELOC(pp);
1337 while (npgs--) {
1338 if (npgs != 0) {
1339 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1340 ASSERT(pp->p_next == (pp + 1));
1341 }
1342 ASSERT(pp->p_szc == szc);
1343 ASSERT(PP_ISFREE(pp));
1344 ASSERT(PP_ISAGED(pp));
1345 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1346 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1347 ASSERT(pp->p_vnode == NULL);
1348 ASSERT(PP_ISNORELOC(pp) == noreloc);
1349
1350 pp = pp->p_next;
1351 }
1352 }
1353 #endif /* DEBUG */
1354
1355 void
1356 page_freelist_lock(int mnode)
1357 {
1358 int i;
1359 for (i = 0; i < NPC_MUTEX; i++) {
1360 mutex_enter(FPC_MUTEX(mnode, i));
1361 mutex_enter(CPC_MUTEX(mnode, i));
1362 }
1363 }
1364
1365 void
1366 page_freelist_unlock(int mnode)
1367 {
1368 int i;
1369 for (i = 0; i < NPC_MUTEX; i++) {
1370 mutex_exit(FPC_MUTEX(mnode, i));
1371 mutex_exit(CPC_MUTEX(mnode, i));
1372 }
1373 }
1374
1375 /*
1376 * add pp to the specified page list. Defaults to head of the page list
1377 * unless PG_LIST_TAIL is specified.
1378 */
1379 void
1380 page_list_add(page_t *pp, int flags)
1381 {
1382 page_t **ppp;
1383 kmutex_t *pcm;
1384 uint_t bin, mtype;
1385 int mnode;
1386
1387 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1388 ASSERT(PP_ISFREE(pp));
1389 ASSERT(!hat_page_is_mapped(pp));
1390 ASSERT(hat_page_getshare(pp) == 0);
1391
1392 /*
1393 * Large pages should be freed via page_list_add_pages().
1394 */
1395 ASSERT(pp->p_szc == 0);
1396
1397 /*
1398 * Don't need to lock the freelist first here
1399 * because the page isn't on the freelist yet.
1400 * This means p_szc can't change on us.
1401 */
1402
1403 bin = PP_2_BIN(pp);
1404 mnode = PP_2_MEM_NODE(pp);
1405 mtype = PP_2_MTYPE(pp);
1406
1407 if (flags & PG_LIST_ISINIT) {
1408 /*
1409  * PG_LIST_ISINIT is set during system startup (i.e. single
1410  * threaded), so add the page to the free list and to the
1411  * free region counters w/o any locking
1412 */
1413 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1414
1415 /* inline version of page_add() */
1416 if (*ppp != NULL) {
1417 pp->p_next = *ppp;
1418 pp->p_prev = (*ppp)->p_prev;
1419 (*ppp)->p_prev = pp;
1420 pp->p_prev->p_next = pp;
1421 } else
1422 *ppp = pp;
1423
1424 page_ctr_add_internal(mnode, mtype, pp, flags);
1425 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1426 } else {
1427 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1428
1429 if (flags & PG_FREE_LIST) {
1430 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1431 ASSERT(PP_ISAGED(pp));
1432 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1433
1434 } else {
1435 VM_STAT_ADD(vmm_vmstats.pladd_cache);
1436 ASSERT(pp->p_vnode);
1437 ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1438 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1439 }
1440 mutex_enter(pcm);
1441 page_add(ppp, pp);
1442
1443 if (flags & PG_LIST_TAIL)
1444 *ppp = (*ppp)->p_next;
1445 /*
1446 * Add counters before releasing pcm mutex to avoid a race with
1447 * page_freelist_coalesce and page_freelist_split.
1448 */
1449 page_ctr_add(mnode, mtype, pp, flags);
1450 mutex_exit(pcm);
1451 }
1452
1453
1454 #if defined(__sparc)
1455 if (PP_ISNORELOC(pp)) {
1456 kcage_freemem_add(1);
1457 }
1458 #endif
1459 /*
1460 * It is up to the caller to unlock the page!
1461 */
1462 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1463 }
1464
1465
1466 #ifdef __sparc
1467 /*
1468 * This routine is only used by kcage_init during system startup.
1469 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1470 * without the overhead of taking locks and updating counters.
1471 */
1472 void
1473 page_list_noreloc_startup(page_t *pp)
1474 {
1475 page_t **ppp;
1476 uint_t bin;
1477 int mnode;
1478 int mtype;
1479 int flags = 0;
1480
1481 /*
1482 * If this is a large page on the freelist then
1483 * break it up into smaller pages.
1484 */
1485 if (pp->p_szc != 0)
1486 page_boot_demote(pp);
1487
1488 /*
1489 * Get list page is currently on.
1490 */
1491 bin = PP_2_BIN(pp);
1492 mnode = PP_2_MEM_NODE(pp);
1493 mtype = PP_2_MTYPE(pp);
1494 ASSERT(mtype == MTYPE_RELOC);
1495 ASSERT(pp->p_szc == 0);
1496
1497 if (PP_ISAGED(pp)) {
1498 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1499 flags |= PG_FREE_LIST;
1500 } else {
1501 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1502 flags |= PG_CACHE_LIST;
1503 }
1504
1505 ASSERT(*ppp != NULL);
1506
1507 /*
1508 * Delete page from current list.
1509 */
1510 if (*ppp == pp)
1511 *ppp = pp->p_next; /* go to next page */
1512 if (*ppp == pp) {
1513 *ppp = NULL; /* page list is gone */
1514 } else {
1515 pp->p_prev->p_next = pp->p_next;
1516 pp->p_next->p_prev = pp->p_prev;
1517 }
1518
1519 /*
1520 * Decrement page counters
1521 */
1522 page_ctr_sub_internal(mnode, mtype, pp, flags);
1523
1524 /*
1525 * Set no reloc for cage initted pages.
1526 */
1527 PP_SETNORELOC(pp);
1528
1529 mtype = PP_2_MTYPE(pp);
1530 ASSERT(mtype == MTYPE_NORELOC);
1531
1532 /*
1533 * Get new list for page.
1534 */
1535 if (PP_ISAGED(pp)) {
1536 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1537 } else {
1538 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1539 }
1540
1541 /*
1542 * Insert page on new list.
1543 */
1544 if (*ppp == NULL) {
1545 *ppp = pp;
1546 pp->p_next = pp->p_prev = pp;
1547 } else {
1548 pp->p_next = *ppp;
1549 pp->p_prev = (*ppp)->p_prev;
1550 (*ppp)->p_prev = pp;
1551 pp->p_prev->p_next = pp;
1552 }
1553
1554 /*
1555 * Increment page counters
1556 */
1557 page_ctr_add_internal(mnode, mtype, pp, flags);
1558
1559 /*
1560 * Update cage freemem counter
1561 */
1562 atomic_add_long(&kcage_freemem, 1);
1563 }
1564 #else /* __sparc */
1565
1566 /* ARGSUSED */
1567 void
1568 page_list_noreloc_startup(page_t *pp)
1569 {
1570 panic("page_list_noreloc_startup: should be here only for sparc");
1571 }
1572 #endif
1573
1574 void
1575 page_list_add_pages(page_t *pp, int flags)
1576 {
1577 kmutex_t *pcm;
1578 pgcnt_t pgcnt;
1579 uint_t bin, mtype, i;
1580 int mnode;
1581
1582 /* default to freelist/head */
1583 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1584
1585 CHK_LPG(pp, pp->p_szc);
1586 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1587
1588 bin = PP_2_BIN(pp);
1589 mnode = PP_2_MEM_NODE(pp);
1590 mtype = PP_2_MTYPE(pp);
1591
1592 if (flags & PG_LIST_ISINIT) {
1593 ASSERT(pp->p_szc == mmu_page_sizes - 1);
1594 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1595 ASSERT(!PP_ISNORELOC(pp));
1596 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1597 } else {
1598
1599 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1600
1601 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1602
1603 mutex_enter(pcm);
1604 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1605 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1606 mutex_exit(pcm);
1607
1608 pgcnt = page_get_pagecnt(pp->p_szc);
1609 #if defined(__sparc)
1610 if (PP_ISNORELOC(pp))
1611 kcage_freemem_add(pgcnt);
1612 #endif
1613 for (i = 0; i < pgcnt; i++, pp++)
1614 page_unlock_nocapture(pp);
1615 }
1616 }
1617
1618 /*
1619 * During boot, need to demote a large page to base
1620 * pagesize pages for seg_kmem for use in boot_alloc()
1621 */
1622 void
1623 page_boot_demote(page_t *pp)
1624 {
1625 ASSERT(pp->p_szc != 0);
1626 ASSERT(PP_ISFREE(pp));
1627 ASSERT(PP_ISAGED(pp));
1628
1629 (void) page_demote(PP_2_MEM_NODE(pp),
1630 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1631 PC_FREE);
1632
1633 ASSERT(PP_ISFREE(pp));
1634 ASSERT(PP_ISAGED(pp));
1635 ASSERT(pp->p_szc == 0);
1636 }
1637
1638 /*
1639 * Take a particular page off of whatever freelist the page
1640 * is claimed to be on.
1641 *
1642 * NOTE: Only used for PAGESIZE pages.
1643 */
1644 void
1645 page_list_sub(page_t *pp, int flags)
1646 {
1647 int bin;
1648 uint_t mtype;
1649 int mnode;
1650 kmutex_t *pcm;
1651 page_t **ppp;
1652
1653 ASSERT(PAGE_EXCL(pp));
1654 ASSERT(PP_ISFREE(pp));
1655
1656 /*
1657 * The p_szc field can only be changed by page_promote()
1658 * and page_demote(). Only free pages can be promoted and
1659 * demoted and the free list MUST be locked during these
1660 * operations. So to prevent a race in page_list_sub()
1661 * between computing which bin of the freelist lock to
1662  * grab and actually grabbing the lock, we check again that
1663 * the bin we locked is still the correct one. Notice that
1664 * the p_szc field could have actually changed on us but
1665 * if the bin happens to still be the same we are safe.
1666 */
1667 try_again:
1668 bin = PP_2_BIN(pp);
1669 mnode = PP_2_MEM_NODE(pp);
1670 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1671 mutex_enter(pcm);
1672 if (PP_2_BIN(pp) != bin) {
1673 mutex_exit(pcm);
1674 goto try_again;
1675 }
1676 mtype = PP_2_MTYPE(pp);
1677
1678 if (flags & PG_FREE_LIST) {
1679 VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1680 ASSERT(PP_ISAGED(pp));
1681 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1682 } else {
1683 VM_STAT_ADD(vmm_vmstats.plsub_cache);
1684 ASSERT(!PP_ISAGED(pp));
1685 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1686 }
1687
1688 /*
1689 * Common PAGESIZE case.
1690 *
1691 * Note that we locked the freelist. This prevents
1692 * any page promotion/demotion operations. Therefore
1693 * the p_szc will not change until we drop pcm mutex.
1694 */
1695 if (pp->p_szc == 0) {
1696 page_sub(ppp, pp);
1697 /*
1698 * Subtract counters before releasing pcm mutex
1699 * to avoid race with page_freelist_coalesce.
1700 */
1701 page_ctr_sub(mnode, mtype, pp, flags);
1702 mutex_exit(pcm);
1703
1704 #if defined(__sparc)
1705 if (PP_ISNORELOC(pp)) {
1706 kcage_freemem_sub(1);
1707 }
1708 #endif
1709 return;
1710 }
1711
1712 /*
1713 * Large pages on the cache list are not supported.
1714 */
1715 if (flags & PG_CACHE_LIST)
1716 panic("page_list_sub: large page on cachelist");
1717
1718 /*
1719 * Slow but rare.
1720 *
1721 * Somebody wants this particular page which is part
1722 * of a large page. In this case we just demote the page
1723 * if it's on the freelist.
1724 *
1725 * We have to drop pcm before locking the entire freelist.
1726 * Once we have re-locked the freelist check to make sure
1727 * the page hasn't already been demoted or completely
1728 * freed.
1729 */
1730 mutex_exit(pcm);
1731 page_freelist_lock(mnode);
1732 if (pp->p_szc != 0) {
1733 /*
1734 * Large page is on freelist.
1735 */
1736 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1737 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1738 }
1739 ASSERT(PP_ISFREE(pp));
1740 ASSERT(PP_ISAGED(pp));
1741 ASSERT(pp->p_szc == 0);
1742
1743 /*
1744 * Subtract counters before releasing pcm mutex
1745 * to avoid race with page_freelist_coalesce.
1746 */
1747 bin = PP_2_BIN(pp);
1748 mtype = PP_2_MTYPE(pp);
1749 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1750
1751 page_sub(ppp, pp);
1752 page_ctr_sub(mnode, mtype, pp, flags);
1753 page_freelist_unlock(mnode);
1754
1755 #if defined(__sparc)
1756 if (PP_ISNORELOC(pp)) {
1757 kcage_freemem_sub(1);
1758 }
1759 #endif
1760 }
1761
1762 void
1763 page_list_sub_pages(page_t *pp, uint_t szc)
1764 {
1765 kmutex_t *pcm;
1766 uint_t bin, mtype;
1767 int mnode;
1768
1769 ASSERT(PAGE_EXCL(pp));
1770 ASSERT(PP_ISFREE(pp));
1771 ASSERT(PP_ISAGED(pp));
1772
1773 /*
1774 * See comment in page_list_sub().
1775 */
1776 try_again:
1777 bin = PP_2_BIN(pp);
1778 mnode = PP_2_MEM_NODE(pp);
1779 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1780 mutex_enter(pcm);
1781 if (PP_2_BIN(pp) != bin) {
1782 mutex_exit(pcm);
1783 goto try_again;
1784 }
1785
1786 /*
1787 * If we're called with a page larger than szc or it got
1788 * promoted above szc before we locked the freelist then
1789 * drop pcm and re-lock entire freelist. If page still larger
1790 * than szc then demote it.
1791 */
1792 if (pp->p_szc > szc) {
1793 mutex_exit(pcm);
1794 pcm = NULL;
1795 page_freelist_lock(mnode);
1796 if (pp->p_szc > szc) {
1797 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1798 (void) page_demote(mnode,
1799 PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1800 pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1801 }
1802 bin = PP_2_BIN(pp);
1803 }
1804 ASSERT(PP_ISFREE(pp));
1805 ASSERT(PP_ISAGED(pp));
1806 ASSERT(pp->p_szc <= szc);
1807 ASSERT(pp == PP_PAGEROOT(pp));
1808
1809 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1810
1811 mtype = PP_2_MTYPE(pp);
1812 if (pp->p_szc != 0) {
1813 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1814 CHK_LPG(pp, pp->p_szc);
1815 } else {
1816 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1817 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1818 }
1819 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1820
1821 if (pcm != NULL) {
1822 mutex_exit(pcm);
1823 } else {
1824 page_freelist_unlock(mnode);
1825 }
1826
1827 #if defined(__sparc)
1828 if (PP_ISNORELOC(pp)) {
1829 pgcnt_t pgcnt;
1830
1831 pgcnt = page_get_pagecnt(pp->p_szc);
1832 kcage_freemem_sub(pgcnt);
1833 }
1834 #endif
1835 }
1836
1837 /*
1838 * Add the page to the front of a linked list of pages
1839 * using the p_next & p_prev pointers for the list.
1840 * The caller is responsible for protecting the list pointers.
1841 */
1842 void
1843 mach_page_add(page_t **ppp, page_t *pp)
1844 {
1845 if (*ppp == NULL) {
1846 pp->p_next = pp->p_prev = pp;
1847 } else {
1848 pp->p_next = *ppp;
1849 pp->p_prev = (*ppp)->p_prev;
1850 (*ppp)->p_prev = pp;
1851 pp->p_prev->p_next = pp;
1852 }
1853 *ppp = pp;
1854 }
1855
1856 /*
1857 * Remove this page from a linked list of pages
1858 * using the p_next & p_prev pointers for the list.
1859 *
1860 * The caller is responsible for protecting the list pointers.
1861 */
1862 void
1863 mach_page_sub(page_t **ppp, page_t *pp)
1864 {
1865 ASSERT(PP_ISFREE(pp));
1866
1867 if (*ppp == NULL || pp == NULL)
1868 panic("mach_page_sub");
1869
1870 if (*ppp == pp)
1871 *ppp = pp->p_next; /* go to next page */
1872
1873 if (*ppp == pp)
1874 *ppp = NULL; /* page list is gone */
1875 else {
1876 pp->p_prev->p_next = pp->p_next;
1877 pp->p_next->p_prev = pp->p_prev;
1878 }
1879 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
1880 }
1881
1882 /*
1883 * Routine fsflush uses to gradually coalesce the free list into larger pages.
1884 */
1885 void
1886 page_promote_size(page_t *pp, uint_t cur_szc)
1887 {
1888 pfn_t pfn;
1889 int mnode;
1890 int idx;
1891 int new_szc = cur_szc + 1;
1892 int full = FULL_REGION_CNT(new_szc);
1893
1894 pfn = page_pptonum(pp);
1895 mnode = PFN_2_MEM_NODE(pfn);
1896
1897 page_freelist_lock(mnode);
1898
1899 idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1900 if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1901 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1902
1903 page_freelist_unlock(mnode);
1904 }
1905
1906 static uint_t page_promote_err;
1907 static uint_t page_promote_noreloc_err;
1908
1909 /*
1910 * Create a single larger page (of szc new_szc) from smaller contiguous pages
1911 * for the given mnode starting at pfnum. Pages involved are on the freelist
1912 * before the call and may be returned to the caller if requested, otherwise
1913 * they will be placed back on the freelist.
1914 * If flags is PC_ALLOC, then the large page will be returned to the user in
1915 * a state which is consistent with a page being taken off the freelist. If
1916 * we failed to lock the new large page, then we will return NULL to the
1917 * caller and put the large page on the freelist instead.
1918 * If flags is PC_FREE, then the large page will be placed on the freelist,
1919 * and NULL will be returned.
1920 * The caller is responsible for locking the freelist as well as any other
1921 * accounting which needs to be done for a returned page.
1922 *
1923 * RFE: For performance pass in pp instead of pfnum so
1924 * we can avoid excessive calls to page_numtopp_nolock().
1925 * This would depend on an assumption that all contiguous
1926 * pages are in the same memseg so we can just add/dec
1927 * our pp.
1928 *
1929 * Lock ordering:
1930 *
1931 * There is a potential but rare deadlock situation
1932 * for page promotion and demotion operations. The problem
1933 * is there are two paths into the freelist manager and
1934 * they have different lock orders:
1935 *
1936 * page_create()
1937 * lock freelist
1938 * page_lock(EXCL)
1939 * unlock freelist
1940 * return
1941 * caller drops page_lock
1942 *
1943 * page_free() and page_reclaim()
1944 * caller grabs page_lock(EXCL)
1945 *
1946 * lock freelist
1947 * unlock freelist
1948 * drop page_lock
1949 *
1950 * What prevents a thread in page_create() from deadlocking
1951 * with a thread freeing or reclaiming the same page is the
1952 * page_trylock() in page_get_freelist(). If the trylock fails
1953 * it skips the page.
1954 *
1955 * The lock ordering for promotion and demotion is the same as
1956 * for page_create(). Since the same deadlock could occur during
1957 * page promotion and freeing or reclaiming of a page on the
1958 * cache list we might have to fail the operation and undo what
1959 * we have done so far. Again, this is rare.
1960 */
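/*
 * A minimal sketch of the trylock discipline page_promote() uses below
 * for cachelist pages, which is what breaks the potential lock-order
 * cycle described above (phm stands for the page hash mutex derived
 * from the page's vnode/offset):
 *
 *	if (!page_trylock(pp, SE_EXCL))
 *		goto fail_promote;
 *	phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset));
 *	if (!mutex_tryenter(phm)) {
 *		page_unlock_nocapture(pp);
 *		goto fail_promote;
 *	}
 *
 * Neither lock is ever waited for while the freelist is held, so the
 * worst case is a failed promotion, never a deadlock.
 */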
1961 page_t *
1962 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1963 {
1964 page_t *pp, *pplist, *tpp, *start_pp;
1965 pgcnt_t new_npgs, npgs;
1966 uint_t bin;
1967 pgcnt_t tmpnpgs, pages_left;
1968 uint_t noreloc;
1969 int which_list;
1970 ulong_t index;
1971 kmutex_t *phm;
1972
1973 /*
1974 * General algorithm:
1975 * Find the starting page
1976 * Walk each page struct removing it from the freelist,
1977 * and linking it to all the other pages removed.
1978 * Once all pages are off the freelist,
1979 * walk the list, modifying p_szc to new_szc and doing
1980 * whatever else is needed to create a large free page.
1981 * According to the flags, either return the page or put it
1982 * on the freelist.
1983 */
1984
1985 start_pp = page_numtopp_nolock(pfnum);
1986 ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1987 new_npgs = page_get_pagecnt(new_szc);
1988 ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1989
1990 /* don't return page of the wrong mtype */
1991 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1992 return (NULL);
1993
1994 /*
1995 * Loop through smaller pages to confirm that all pages
1996 * give the same result for PP_ISNORELOC().
1997 * We can check this reliably here as the protocol for setting
1998 * P_NORELOC requires pages to be taken off the free list first.
1999 */
2000 noreloc = PP_ISNORELOC(start_pp);
2001 for (pp = start_pp + new_npgs; --pp > start_pp; ) {
2002 if (noreloc != PP_ISNORELOC(pp)) {
2003 page_promote_noreloc_err++;
2004 page_promote_err++;
2005 return (NULL);
2006 }
2007 }
2008
2009 pages_left = new_npgs;
2010 pplist = NULL;
2011 pp = start_pp;
2012
2013 /* Loop around coalescing the smaller pages into a big page. */
2014 while (pages_left) {
2015 /*
2016 * Remove from the freelist.
2017 */
2018 ASSERT(PP_ISFREE(pp));
2019 bin = PP_2_BIN(pp);
2020 ASSERT(mnode == PP_2_MEM_NODE(pp));
2021 mtype = PP_2_MTYPE(pp);
2022 if (PP_ISAGED(pp)) {
2023
2024 /*
2025 * PG_FREE_LIST
2026 */
2027 if (pp->p_szc) {
2028 page_vpsub(&PAGE_FREELISTS(mnode,
2029 pp->p_szc, bin, mtype), pp);
2030 } else {
2031 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
2032 bin, mtype), pp);
2033 }
2034 which_list = PG_FREE_LIST;
2035 } else {
2036 ASSERT(pp->p_szc == 0);
2037
2038 /*
2039 * PG_CACHE_LIST
2040 *
2041 * Since this page comes from the
2042 * cachelist, we must destroy the
2043 * vnode association.
2044 */
2045 if (!page_trylock(pp, SE_EXCL)) {
2046 goto fail_promote;
2047 }
2048
2049 /*
2050 * We need to be careful not to deadlock
2051 * with another thread in page_lookup().
2052 * The page_lookup() thread could be holding
2053 * the same phm that we need if the two
2054 * pages happen to hash to the same phm lock.
2055 * At this point we have locked the entire
2056 * freelist and page_lookup() could be trying
2057 * to grab a freelist lock.
2058 */
2059 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
2060 phm = PAGE_HASH_MUTEX(index);
2061 if (!mutex_tryenter(phm)) {
2062 page_unlock_nocapture(pp);
2063 goto fail_promote;
2064 }
2065
2066 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2067 page_hashout(pp, phm);
2068 mutex_exit(phm);
2069 PP_SETAGED(pp);
2070 page_unlock_nocapture(pp);
2071 which_list = PG_CACHE_LIST;
2072 }
2073 page_ctr_sub(mnode, mtype, pp, which_list);
2074
2075 /*
2076 * Concatenate the smaller page(s) onto
2077 * the large page list.
2078 */
2079 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2080 pages_left -= npgs;
2081 tpp = pp;
2082 while (npgs--) {
2083 tpp->p_szc = new_szc;
2084 tpp = tpp->p_next;
2085 }
2086 page_list_concat(&pplist, &pp);
2087 pp += tmpnpgs;
2088 }
2089 CHK_LPG(pplist, new_szc);
2090
2091 /*
2092 * return the page to the user if requested
2093 * in the properly locked state.
2094 */
2095 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2096 return (pplist);
2097 }
2098
2099 /*
2100 * Otherwise place the new large page on the freelist
2101 */
2102 bin = PP_2_BIN(pplist);
2103 mnode = PP_2_MEM_NODE(pplist);
2104 mtype = PP_2_MTYPE(pplist);
2105 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2106
2107 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2108 return (NULL);
2109
2110 fail_promote:
2111 /*
2112 * A thread must have still been freeing or
2113 * reclaiming the page on the cachelist.
2114 * To prevent a deadlock undo what we have
2115 * done so far and return failure. This
2116 * situation can only happen while promoting
2117 * PAGESIZE pages.
2118 */
2119 page_promote_err++;
2120 while (pplist) {
2121 pp = pplist;
2122 mach_page_sub(&pplist, pp);
2123 pp->p_szc = 0;
2124 bin = PP_2_BIN(pp);
2125 mtype = PP_2_MTYPE(pp);
2126 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2127 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2128 }
2129 return (NULL);
2130
2131 }
2132
2133 /*
2134 * Break up a large page into smaller size pages.
2135 * Pages involved are on the freelist before the call and may
2136 * be returned to the caller if requested, otherwise they will
2137 * be placed back on the freelist.
2138 * The caller is responsible for locking the freelist as well as any other
2139 * accounting which needs to be done for a returned page.
2140 * If flags is not PC_ALLOC, the color argument is ignored, and thus
2141 * technically, any value may be passed in but PC_NO_COLOR is the standard
2142 * which should be followed for clarity's sake.
2143 * Returns a page whose pfn is < pfnmax
2144 */
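/*
 * Worked example with hypothetical geometry: demoting a region of 512
 * base pages (cur_szc) into pieces of 64 base pages (new_szc) yields
 * 512 / 64 = 8 pieces. Each piece is either handed back to the caller
 * (at most one, when flags == PC_ALLOC, its bin matches 'color', its
 * pfn is below pfnmax and the constituent pages can be trylocked) or
 * re-inserted on the new_szc freelist with its counters updated.
 */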
2145 page_t *
2146 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2147 uchar_t new_szc, int color, int flags)
2148 {
2149 page_t *pp, *pplist, *npplist;
2150 pgcnt_t npgs, n;
2151 uint_t bin;
2152 uint_t mtype;
2153 page_t *ret_pp = NULL;
2154
2155 ASSERT(cur_szc != 0);
2156 ASSERT(new_szc < cur_szc);
2157
2158 pplist = page_numtopp_nolock(pfnum);
2159 ASSERT(pplist != NULL);
2160
2161 ASSERT(pplist->p_szc == cur_szc);
2162
2163 bin = PP_2_BIN(pplist);
2164 ASSERT(mnode == PP_2_MEM_NODE(pplist));
2165 mtype = PP_2_MTYPE(pplist);
2166 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2167
2168 CHK_LPG(pplist, cur_szc);
2169 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2170
2171 /*
2172 * Number of PAGESIZE pages for smaller new_szc
2173 * page.
2174 */
2175 npgs = page_get_pagecnt(new_szc);
2176
2177 while (pplist) {
2178 pp = pplist;
2179
2180 ASSERT(pp->p_szc == cur_szc);
2181
2182 /*
2183 * We either break it up into PAGESIZE pages or larger.
2184 */
2185 if (npgs == 1) { /* PAGESIZE case */
2186 mach_page_sub(&pplist, pp);
2187 ASSERT(pp->p_szc == cur_szc);
2188 ASSERT(new_szc == 0);
2189 ASSERT(mnode == PP_2_MEM_NODE(pp));
2190 pp->p_szc = new_szc;
2191 bin = PP_2_BIN(pp);
2192 if ((bin == color) && (flags == PC_ALLOC) &&
2193 (ret_pp == NULL) && (pfnmax == 0 ||
2194 pp->p_pagenum < pfnmax) &&
2195 page_trylock_cons(pp, SE_EXCL)) {
2196 ret_pp = pp;
2197 } else {
2198 mtype = PP_2_MTYPE(pp);
2199 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2200 mtype), pp);
2201 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2202 }
2203 } else {
2204 page_t *try_to_return_this_page = NULL;
2205 int count = 0;
2206
2207 /*
2208 * Break down into smaller lists of pages.
2209 */
2210 page_list_break(&pplist, &npplist, npgs);
2211
2212 pp = pplist;
2213 n = npgs;
2214 while (n--) {
2215 ASSERT(pp->p_szc == cur_szc);
2216 /*
2217 * Check whether all the pages in this list
2218 * fit the request criteria.
2219 */
2220 if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2221 count++;
2222 }
2223 pp->p_szc = new_szc;
2224 pp = pp->p_next;
2225 }
2226
2227 if (count == npgs &&
2228 (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2229 try_to_return_this_page = pp;
2230 }
2231
2232 CHK_LPG(pplist, new_szc);
2233
2234 bin = PP_2_BIN(pplist);
2235 if (try_to_return_this_page)
2236 ASSERT(mnode ==
2237 PP_2_MEM_NODE(try_to_return_this_page));
2238 if ((bin == color) && (flags == PC_ALLOC) &&
2239 (ret_pp == NULL) && try_to_return_this_page &&
2240 page_trylock_cons(try_to_return_this_page,
2241 SE_EXCL)) {
2242 ret_pp = try_to_return_this_page;
2243 } else {
2244 mtype = PP_2_MTYPE(pp);
2245 page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2246 bin, mtype), pplist);
2247
2248 page_ctr_add(mnode, mtype, pplist,
2249 PG_FREE_LIST);
2250 }
2251 pplist = npplist;
2252 }
2253 }
2254 return (ret_pp);
2255 }
2256
2257 int mpss_coalesce_disable = 0;
2258
2259 /*
2260 * Coalesce free pages into a page of the given szc and color if possible.
2261 * Return the pointer to the page created, otherwise, return NULL.
2262 *
2263 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2264 */
2265 page_t *
2266 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2267 int mtype, pfn_t pfnhi)
2268 {
2269 int r = szc; /* region size */
2270 int mrange;
2271 uint_t full, bin, color_mask, wrap = 0;
2272 pfn_t pfnum, lo, hi;
2273 size_t len, idx, idx0;
2274 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
2275 page_t *ret_pp;
2276 MEM_NODE_ITERATOR_DECL(it);
2277 #if defined(__sparc)
2278 pfn_t pfnum0, nlo, nhi;
2279 #endif
2280
2281 if (mpss_coalesce_disable) {
2282 ASSERT(szc < MMU_PAGE_SIZES);
2283 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2284 return (NULL);
2285 }
2286
2287 ASSERT(szc < mmu_page_sizes);
2288 color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2289 ASSERT(ceq_mask <= color_mask);
2290 ASSERT(color <= color_mask);
2291 color &= ceq_mask;
2292
2293 /* Prevent page_counters dynamic memory from being freed */
2294 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2295
2296 mrange = MTYPE_2_MRANGE(mnode, mtype);
2297 ASSERT(mrange < mnode_nranges[mnode]);
2298 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2299
2300 /* get pfn range for mtype */
2301 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2302 MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2303 hi++;
2304
2305 /* use lower limit if given */
2306 if (pfnhi != PFNNULL && pfnhi < hi)
2307 hi = pfnhi;
2308
2309 /* round to szcpgcnt boundaries */
2310 lo = P2ROUNDUP(lo, szcpgcnt);
2311 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2312 if (lo == (pfn_t)-1) {
2313 rw_exit(&page_ctrs_rwlock[mnode]);
2314 return (NULL);
2315 }
2316 hi = hi & ~(szcpgcnt - 1);
2317
2318 /* set lo to the closest pfn of the right color */
2319 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2320 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2321 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2322 &it);
2323 }
2324
2325 if (hi <= lo) {
2326 rw_exit(&page_ctrs_rwlock[mnode]);
2327 return (NULL);
2328 }
2329
2330 full = FULL_REGION_CNT(r);
2331
2332 /* calculate the number of page candidates and initial search index */
2333 bin = color;
2334 idx0 = (size_t)(-1);
2335 do {
2336 pgcnt_t acand;
2337
2338 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2339 if (acand) {
2340 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2341 r, bin, mrange);
2342 idx0 = MIN(idx0, idx);
2343 cands += acand;
2344 }
2345 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2346 } while (bin != color);
2347
2348 if (cands == 0) {
2349 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2350 rw_exit(&page_ctrs_rwlock[mnode]);
2351 return (NULL);
2352 }
2353
2354 pfnum = IDX_TO_PNUM(mnode, r, idx0);
2355 if (pfnum < lo || pfnum >= hi) {
2356 pfnum = lo;
2357 } else {
2358 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2359 if (pfnum == (pfn_t)-1) {
2360 pfnum = lo;
2361 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2362 ASSERT(pfnum != (pfn_t)-1);
2363 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2364 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2365 /* invalid color, get the closest correct pfn */
2366 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2367 color_mask, &it);
2368 if (pfnum >= hi) {
2369 pfnum = lo;
2370 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2371 }
2372 }
2373 }
2374
2375 /* set starting index */
2376 idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2377 ASSERT(idx0 < len);
2378
2379 #if defined(__sparc)
2380 pfnum0 = pfnum; /* page corresponding to idx0 */
2381 nhi = 0; /* search kcage ranges */
2382 #endif
2383
2384 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2385
2386 #if defined(__sparc)
2387 /*
2388 * Find lowest intersection of kcage ranges and mnode.
2389 * MTYPE_NORELOC means look in the cage, otherwise outside.
2390 */
2391 if (nhi <= pfnum) {
2392 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2393 (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2394 goto wrapit;
2395
2396 /* jump to the next page in the range */
2397 if (pfnum < nlo) {
2398 pfnum = P2ROUNDUP(nlo, szcpgcnt);
2399 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2400 idx = PNUM_TO_IDX(mnode, r, pfnum);
2401 if (idx >= len || pfnum >= hi)
2402 goto wrapit;
2403 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2404 ceq_mask)
2405 goto next;
2406 if (interleaved_mnodes &&
2407 PFN_2_MEM_NODE(pfnum) != mnode)
2408 goto next;
2409 }
2410 }
2411 #endif
2412
2413 if (PAGE_COUNTERS(mnode, r, idx) != full)
2414 goto next;
2415
2416 /*
2417 * RFE: For performance maybe we can do something less
2418 * brutal than locking the entire freelist. So far
2419 * this doesn't seem to be a performance problem?
2420 */
2421 page_freelist_lock(mnode);
2422 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2423 ret_pp =
2424 page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2425 if (ret_pp != NULL) {
2426 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2427 PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2428 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2429 page_freelist_unlock(mnode);
2430 rw_exit(&page_ctrs_rwlock[mnode]);
2431 #if defined(__sparc)
2432 if (PP_ISNORELOC(ret_pp)) {
2433 pgcnt_t npgs;
2434
2435 npgs = page_get_pagecnt(ret_pp->p_szc);
2436 kcage_freemem_sub(npgs);
2437 }
2438 #endif
2439 return (ret_pp);
2440 }
2441 } else {
2442 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2443 }
2444
2445 page_freelist_unlock(mnode);
2446 /*
2447 * No point looking for another page if we've
2448 * already tried all of the ones that
2449 * page_ctr_cands indicated. Stash off where we left
2450 * off.
2451 * Note: this is not exact since we don't hold the
2452 * page_freelist_locks before we initially get the
2453 * value of cands for performance reasons, but should
2454 * be a decent approximation.
2455 */
2456 if (--cands == 0) {
2457 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2458 idx;
2459 break;
2460 }
2461 next:
2462 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2463 color_mask, &it);
2464 idx = PNUM_TO_IDX(mnode, r, pfnum);
2465 if (idx >= len || pfnum >= hi) {
2466 wrapit:
2467 pfnum = lo;
2468 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2469 idx = PNUM_TO_IDX(mnode, r, pfnum);
2470 wrap++;
2471 #if defined(__sparc)
2472 nhi = 0; /* search kcage ranges */
2473 #endif
2474 }
2475 }
2476
2477 rw_exit(&page_ctrs_rwlock[mnode]);
2478 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2479 return (NULL);
2480 }
2481
2482 /*
2483 * For the given mnode, promote as many small pages to large pages as possible.
2484 * mnode can be -1, which means do them all
2485 */
2486 void
2487 page_freelist_coalesce_all(int mnode)
2488 {
2489 int r; /* region size */
2490 int idx, full;
2491 size_t len;
2492 int doall = interleaved_mnodes || mnode < 0;
2493 int mlo = doall ? 0 : mnode;
2494 int mhi = doall ? max_mem_nodes : (mnode + 1);
2495
2496 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2497
2498 if (mpss_coalesce_disable) {
2499 return;
2500 }
2501
2502 /*
2503 * Lock the entire freelist and coalesce what we can.
2504 *
2505 * Always promote to the largest page possible
2506 * first to reduce the number of page promotions.
2507 */
2508 for (mnode = mlo; mnode < mhi; mnode++) {
2509 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2510 page_freelist_lock(mnode);
2511 }
2512 for (r = mmu_page_sizes - 1; r > 0; r--) {
2513 for (mnode = mlo; mnode < mhi; mnode++) {
2514 pgcnt_t cands = 0;
2515 int mrange, nranges = mnode_nranges[mnode];
2516
2517 for (mrange = 0; mrange < nranges; mrange++) {
2518 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2519 if (cands != 0)
2520 break;
2521 }
2522 if (cands == 0) {
2523 VM_STAT_ADD(vmm_vmstats.
2524 page_ctrs_cands_skip_all);
2525 continue;
2526 }
2527
2528 full = FULL_REGION_CNT(r);
2529 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2530
2531 for (idx = 0; idx < len; idx++) {
2532 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2533 pfn_t pfnum =
2534 IDX_TO_PNUM(mnode, r, idx);
2535 int tmnode = interleaved_mnodes ?
2536 PFN_2_MEM_NODE(pfnum) : mnode;
2537
2538 ASSERT(pfnum >=
2539 mem_node_config[tmnode].physbase &&
2540 pfnum <
2541 mem_node_config[tmnode].physmax);
2542
2543 (void) page_promote(tmnode,
2544 pfnum, r, PC_FREE, PC_MTYPE_ANY);
2545 }
2546 }
2547 /* shared hpm_counters covers all mnodes, so we quit */
2548 if (interleaved_mnodes)
2549 break;
2550 }
2551 }
2552 for (mnode = mlo; mnode < mhi; mnode++) {
2553 page_freelist_unlock(mnode);
2554 rw_exit(&page_ctrs_rwlock[mnode]);
2555 }
2556 }
2557
2558 /*
2559 * This is where all the policies for moving pages around
2560 * to different page size free lists are implemented.
2561 * Returns 1 on success, 0 on failure.
2562 *
2563 * So far these are the priorities for this algorithm in descending
2564 * order:
2565 *
2566 * 1) When servicing a request try to do so with a free page
2567 * from next size up. Helps defer fragmentation as long
2568 * as possible.
2569 *
2570 * 2) Page coalesce on demand. Only when a freelist
2571 * larger than PAGESIZE is empty and step 1
2572 * will not work since all larger size lists are
2573 * also empty.
2574 *
2575 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2576 */
2577
2578 page_t *
2579 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2580 pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2581 {
2582 uchar_t nszc = szc + 1;
2583 uint_t bin, sbin, bin_prev;
2584 page_t *pp, *firstpp;
2585 page_t *ret_pp = NULL;
2586 uint_t color_mask;
2587
2588 if (nszc == mmu_page_sizes)
2589 return (NULL);
2590
2591 ASSERT(nszc < mmu_page_sizes);
2592 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2593 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2594 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2595 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2596
2597 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2598 /*
2599 * First try to break up a larger page to fill current size freelist.
2600 */
2601 while (plw->plw_bins[nszc] != 0) {
2602
2603 ASSERT(nszc < mmu_page_sizes);
2604
2605 /*
2606 * If page found then demote it.
2607 */
2608 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2609 page_freelist_lock(mnode);
2610 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2611
2612 /*
2613 * If pfnhi is not PFNNULL, look for large page below
2614 * pfnhi. PFNNULL signifies no pfn requirement.
2615 */
2616 if (pp &&
2617 ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2618 (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2619 do {
2620 pp = pp->p_vpnext;
2621 if (pp == firstpp) {
2622 pp = NULL;
2623 break;
2624 }
2625 } while ((pfnhi != PFNNULL &&
2626 pp->p_pagenum >= pfnhi) ||
2627 (pfnlo != PFNNULL &&
2628 pp->p_pagenum < pfnlo));
2629
2630 if (pfnhi != PFNNULL && pp != NULL)
2631 ASSERT(pp->p_pagenum < pfnhi);
2632
2633 if (pfnlo != PFNNULL && pp != NULL)
2634 ASSERT(pp->p_pagenum >= pfnlo);
2635 }
2636 if (pp) {
2637 uint_t ccolor = page_correct_color(szc, nszc,
2638 color, bin, plw->plw_ceq_mask[szc]);
2639
2640 ASSERT(pp->p_szc == nszc);
2641 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2642 ret_pp = page_demote(mnode, pp->p_pagenum,
2643 pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2644 if (ret_pp) {
2645 page_freelist_unlock(mnode);
2646 #if defined(__sparc)
2647 if (PP_ISNORELOC(ret_pp)) {
2648 pgcnt_t npgs;
2649
2650 npgs = page_get_pagecnt(
2651 ret_pp->p_szc);
2652 kcage_freemem_sub(npgs);
2653 }
2654 #endif
2655 return (ret_pp);
2656 }
2657 }
2658 page_freelist_unlock(mnode);
2659 }
2660
2661 /* loop through next size bins */
2662 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2663 plw->plw_bins[nszc]--;
2664
2665 if (bin == sbin) {
2666 uchar_t nnszc = nszc + 1;
2667
2668 /* we are done with this page size - check next */
2669 if (plw->plw_bins[nnszc] == 0)
2670 /* we have already checked next size bins */
2671 break;
2672
2673 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2674 if (bin_prev != INVALID_COLOR) {
2675 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2676 if (!((bin ^ bin_prev) &
2677 plw->plw_ceq_mask[nnszc]))
2678 break;
2679 }
2680 ASSERT(nnszc < mmu_page_sizes);
2681 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2682 nszc = nnszc;
2683 ASSERT(nszc < mmu_page_sizes);
2684 }
2685 }
2686
2687 return (ret_pp);
2688 }
2689
2690 /*
2691 * Helper routine used only by the freelist code to lock
2692 * a page. If the page is a large page then it succeeds in
2693 * locking all the constituent pages or none at all.
2694 * Returns 1 on success, 0 on failure.
2695 */
2696 static int
2697 page_trylock_cons(page_t *pp, se_t se)
2698 {
2699 page_t *tpp, *first_pp = pp;
2700
2701 /*
2702 * Fail if can't lock first or only page.
2703 */
2704 if (!page_trylock(pp, se)) {
2705 return (0);
2706 }
2707
2708 /*
2709 * PAGESIZE: common case.
2710 */
2711 if (pp->p_szc == 0) {
2712 return (1);
2713 }
2714
2715 /*
2716 * Large page case.
2717 */
2718 tpp = pp->p_next;
2719 while (tpp != pp) {
2720 if (!page_trylock(tpp, se)) {
2721 /*
2722 * On failure unlock what we have locked so far.
2723 * We want to avoid attempting to capture these
2724 * pages as the pcm mutex may be held which could
2725 * lead to a recursive mutex panic.
2726 */
2727 while (first_pp != tpp) {
2728 page_unlock_nocapture(first_pp);
2729 first_pp = first_pp->p_next;
2730 }
2731 return (0);
2732 }
2733 tpp = tpp->p_next;
2734 }
2735 return (1);
2736 }
2737
2738 /*
2739 * init context for walking page lists
2740 * Called when a page of the given szc is unavailable. Sets markers
2741 * for the beginning of the search to detect when search has
2742 * completed a full cycle. Sets flags for splitting larger pages
2743 * and coalescing smaller pages. Page walking proceeds until a page
2744 * of the desired equivalent color is found.
2745 */
2746 void
2747 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2748 int use_ceq, page_list_walker_t *plw)
2749 {
2750 uint_t nszc, ceq_mask, colors;
2751 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2752
2753 ASSERT(szc < mmu_page_sizes);
2754 colors = PAGE_GET_PAGECOLORS(szc);
2755
2756 plw->plw_colors = colors;
2757 plw->plw_color_mask = colors - 1;
2758 plw->plw_bin_marker = plw->plw_bin0 = bin;
2759 plw->plw_bin_split_prev = bin;
2760 plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2761
2762 /*
2763 * if vac aliasing is possible make sure lower order color
2764 * bits are never ignored
2765 */
2766 if (vac_colors > 1)
2767 ceq &= 0xf0;
2768
2769 /*
2770 * calculate the number of non-equivalent colors and
2771 * color equivalency mask
2772 */
2773 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2774 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2775 ASSERT(plw->plw_ceq_dif > 0);
2776 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
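	/*
	 * Worked example with hypothetical values: colors == 32 and
	 * ceq == 0x21 give plw_ceq_dif = 32 >> (2 + 1) = 4 equivalence
	 * classes and plw_ceq_mask = (4 - 1) << 1 = 0x6, i.e. only color
	 * bits 1 and 2 distinguish bins; bit 0 and bits 3-4 are ignored.
	 */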
2777
2778 if (flags & PG_MATCH_COLOR) {
2779 if (cpu_page_colors < 0) {
2780 /*
2781 * this is a heterogeneous machine with different CPUs
2782 * having different size e$ (not supported for ni2/rock).
2783 */
2784 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2785 cpucolors = MAX(cpucolors, 1);
2786 ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2787 plw->plw_ceq_mask[szc] =
2788 MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2789 }
2790 plw->plw_ceq_dif = 1;
2791 }
2792
2793 /* we can split pages in the freelist, but not the cachelist */
2794 if (can_split) {
2795 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2796
2797 /* set next szc color masks and number of free list bins */
2798 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2799 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2800 plw->plw_ceq_mask[szc]);
2801 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2802 }
2803 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2804 plw->plw_bins[nszc] = 0;
2805
2806 } else {
2807 ASSERT(szc == 0);
2808 plw->plw_do_split = 0;
2809 plw->plw_bins[1] = 0;
2810 plw->plw_ceq_mask[1] = INVALID_MASK;
2811 }
2812 }
2813
2814 /*
2815 * set mark to flag where next split should occur
2816 */
2817 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \
2818 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \
2819 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \
2820 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \
2821 plw->plw_split_next = \
2822 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \
2823 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2824 plw->plw_split_next = \
2825 INC_MASKED(plw->plw_split_next, \
2826 neq_mask, plw->plw_color_mask); \
2827 } \
2828 }
2829
2830 uint_t
2831 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2832 {
2833 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2834 uint_t bin0_nsz, nbin_nsz, nbin0, nbin;
2835 uchar_t nszc = szc + 1;
2836
2837 nbin = ADD_MASKED(bin,
2838 plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2839
2840 if (plw->plw_do_split) {
2841 plw->plw_bin_split_prev = bin;
2842 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2843 plw->plw_do_split = 0;
2844 }
2845
2846 if (szc == 0) {
2847 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2848 if (nbin == plw->plw_bin0 &&
2849 (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2850 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2851 neq_mask, plw->plw_color_mask);
2852 plw->plw_bin_split_prev = plw->plw_bin0;
2853 }
2854
2855 if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2856 plw->plw_bin_marker =
2857 nbin = INC_MASKED(nbin, neq_mask,
2858 plw->plw_color_mask);
2859 plw->plw_bin_split_prev = plw->plw_bin0;
2860 /*
2861 * large pages all have the same vac color
2862 * so by now we should be done with next
2863 * size page splitting process
2864 */
2865 ASSERT(plw->plw_bins[1] == 0);
2866 plw->plw_do_split = 0;
2867 return (nbin);
2868 }
2869
2870 } else {
2871 uint_t bin_jump = (vac_colors == 1) ?
2872 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2873
2874 bin_jump &= ~(vac_colors - 1);
2875
2876 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2877 plw->plw_color_mask);
2878
2879 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2880
2881 plw->plw_bin_marker = nbin = nbin0;
2882
2883 if (plw->plw_bins[nszc] != 0) {
2884 /*
2885 * check if next page size bin is the
2886 * same as the next page size bin for
2887 * bin0
2888 */
2889 nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2890 nbin);
2891 bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2892 plw->plw_bin0);
2893
2894 if ((bin0_nsz ^ nbin_nsz) &
2895 plw->plw_ceq_mask[nszc])
2896 plw->plw_do_split = 1;
2897 }
2898 return (nbin);
2899 }
2900 }
2901 }
2902
2903 if (plw->plw_bins[nszc] != 0) {
2904 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2905 if (!((plw->plw_split_next ^ nbin_nsz) &
2906 plw->plw_ceq_mask[nszc]))
2907 plw->plw_do_split = 1;
2908 }
2909
2910 return (nbin);
2911 }
2912
2913 page_t *
2914 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2915 uint_t flags)
2916 {
2917 kmutex_t *pcm;
2918 page_t *pp, *first_pp;
2919 uint_t sbin;
2920 int plw_initialized;
2921 page_list_walker_t plw;
2922
2923 ASSERT(szc < mmu_page_sizes);
2924
2925 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2926
2927 MTYPE_START(mnode, mtype, flags);
2928 if (mtype < 0) { /* mnode does not have memory in mtype range */
2929 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2930 return (NULL);
2931 }
2932 try_again:
2933
2934 plw_initialized = 0;
2935 plw.plw_ceq_dif = 1;
2936
2937 /*
2938 * Only hold one freelist lock at a time, that way we
2939 * can start anywhere and not have to worry about lock
2940 * ordering.
2941 */
2942 for (plw.plw_count = 0;
2943 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2944 sbin = bin;
2945 do {
2946 if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2947 goto bin_empty_1;
2948
2949 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2950 mutex_enter(pcm);
2951 pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2952 if (pp == NULL)
2953 goto bin_empty_0;
2954
2955 /*
2956 * These were set before the page
2957 * was put on the free list,
2958 * they must still be set.
2959 */
2960 ASSERT(PP_ISFREE(pp));
2961 ASSERT(PP_ISAGED(pp));
2962 ASSERT(pp->p_vnode == NULL);
2963 ASSERT(pp->p_hash == NULL);
2964 ASSERT(pp->p_offset == (u_offset_t)-1);
2965 ASSERT(pp->p_szc == szc);
2966 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2967
2968 /*
2969 * Walk down the hash chain.
2970 * 8k pages are linked on p_next
2971 * and p_prev fields. Large pages
2972 * are a contiguous group of
2973 * constituent pages linked together
2974 * on their p_next and p_prev fields.
2975 * The large pages are linked together
2976 * on the hash chain using p_vpnext
2977 * p_vpprev of the base constituent
2978 * page of each large page.
2979 */
2980 first_pp = pp;
2981 while (IS_DUMP_PAGE(pp) || !page_trylock_cons(pp,
2982 SE_EXCL)) {
2983 if (szc == 0) {
2984 pp = pp->p_next;
2985 } else {
2986 pp = pp->p_vpnext;
2987 }
2988
2989 ASSERT(PP_ISFREE(pp));
2990 ASSERT(PP_ISAGED(pp));
2991 ASSERT(pp->p_vnode == NULL);
2992 ASSERT(pp->p_hash == NULL);
2993 ASSERT(pp->p_offset == (u_offset_t)-1);
2994 ASSERT(pp->p_szc == szc);
2995 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2996
2997 if (pp == first_pp)
2998 goto bin_empty_0;
2999 }
3000
3001 ASSERT(pp != NULL);
3002 ASSERT(mtype == PP_2_MTYPE(pp));
3003 ASSERT(pp->p_szc == szc);
3004 if (szc == 0) {
3005 page_sub(&PAGE_FREELISTS(mnode,
3006 szc, bin, mtype), pp);
3007 } else {
3008 page_vpsub(&PAGE_FREELISTS(mnode,
3009 szc, bin, mtype), pp);
3010 CHK_LPG(pp, szc);
3011 }
3012 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3013
3014 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
3015 panic("free page is not. pp %p", (void *)pp);
3016 mutex_exit(pcm);
3017
3018 #if defined(__sparc)
3019 ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
3020 (flags & PG_NORELOC) == 0);
3021
3022 if (PP_ISNORELOC(pp))
3023 kcage_freemem_sub(page_get_pagecnt(szc));
3024 #endif
3025 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
3026 return (pp);
3027
3028 bin_empty_0:
3029 mutex_exit(pcm);
3030 bin_empty_1:
3031 if (plw_initialized == 0) {
3032 page_list_walk_init(szc, flags, bin, 1, 1,
3033 &plw);
3034 plw_initialized = 1;
3035 ASSERT(plw.plw_colors <=
3036 PAGE_GET_PAGECOLORS(szc));
3037 ASSERT(plw.plw_colors > 0);
3038 ASSERT((plw.plw_colors &
3039 (plw.plw_colors - 1)) == 0);
3040 ASSERT(bin < plw.plw_colors);
3041 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
3042 }
3043 /* calculate the next bin with equivalent color */
3044 bin = ADD_MASKED(bin, plw.plw_bin_step,
3045 plw.plw_ceq_mask[szc], plw.plw_color_mask);
3046 } while (sbin != bin);
3047
3048 /*
3049 * All bins of equivalent color are empty. Try to
3050 * satisfy the request by breaking up or coalescing
3051 * pages from a different size freelist of the correct
3052 * color that satisfies the ORIGINAL color requested.
3053 * If that fails then try pages of the same size but
3054 * different colors assuming we are not called with
3055 * PG_MATCH_COLOR.
3056 */
3057 if (plw.plw_do_split &&
3058 (pp = page_freelist_split(szc, bin, mnode,
3059 mtype, PFNNULL, PFNNULL, &plw)) != NULL)
3060 return (pp);
3061
3062 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
3063 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
3064 return (pp);
3065
3066 if (plw.plw_ceq_dif > 1)
3067 bin = page_list_walk_next_bin(szc, bin, &plw);
3068 }
3069
3070 /* if allowed, cycle through additional mtypes */
3071 MTYPE_NEXT(mnode, mtype, flags);
3072 if (mtype >= 0)
3073 goto try_again;
3074
3075 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
3076
3077 return (NULL);
3078 }
3079
3080 /*
3081 * Returns the count of free pages for 'pp' with size code 'szc'.
3082 * Note: This function does not return an exact value as the page freelist
3083 * locks are not held and thus the values in the page_counters may be
3084 * changing as we walk through the data.
3085 */
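/*
 * Worked example of the accounting done below, using hypothetical
 * geometry (FULL_REGION_CNT(2) == FULL_REGION_CNT(1) == 8,
 * PNUM_SHIFT(1) == 3, PNUM_SHIFT(0) == 0) and szc == 2: if the level-2
 * counter reads 3, the first step credits 3 << 3 = 24 pages for the
 * three completely free level-1 sub-regions. The loop then reads the
 * eight level-1 counters; any that read 8 (full) are skipped since they
 * were already counted, and one reading, say, 5 adds 5 more free pages.
 */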
3086 static int
3087 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3088 {
3089 pgcnt_t pgfree;
3090 pgcnt_t cnt;
3091 ssize_t r = szc; /* region size */
3092 ssize_t idx;
3093 int i;
3094 int full, range;
3095
3096 /* Make sure pagenum passed in is aligned properly */
3097 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3098 ASSERT(szc > 0);
3099
3100 /* Prevent page_counters dynamic memory from being freed */
3101 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3102 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3103 cnt = PAGE_COUNTERS(mnode, r, idx);
3104 pgfree = cnt << PNUM_SHIFT(r - 1);
3105 range = FULL_REGION_CNT(szc);
3106
3107 /* Check for completely full region */
3108 if (cnt == range) {
3109 rw_exit(&page_ctrs_rwlock[mnode]);
3110 return (pgfree);
3111 }
3112
3113 while (--r > 0) {
3114 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3115 full = FULL_REGION_CNT(r);
3116 for (i = 0; i < range; i++, idx++) {
3117 cnt = PAGE_COUNTERS(mnode, r, idx);
3118 /*
3119 * If cnt here is full, that means we have already
3120 * accounted for these pages earlier.
3121 */
3122 if (cnt != full) {
3123 pgfree += (cnt << PNUM_SHIFT(r - 1));
3124 }
3125 }
3126 range *= full;
3127 }
3128 rw_exit(&page_ctrs_rwlock[mnode]);
3129 return (pgfree);
3130 }
3131
3132 /*
3133 * Called from page_geti_contig_pages to exclusively lock constituent pages
3134 * starting from 'spp' for page size code 'szc'.
3135 *
3136 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3137 * region needs to be greater than or equal to the threshold.
3138 */
3139 static int
3140 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3141 {
3142 pgcnt_t pgcnt = PNUM_SIZE(szc);
3143 pgcnt_t pgfree, i;
3144 page_t *pp;
3145
3146 VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3147
3148
3149 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3150 goto skipptcpcheck;
3151 /*
3152 * check if there are sufficient free pages available before attempting
3153 * to trylock. Count is approximate as page counters can change.
3154 */
3155 pgfree = page_freecnt(mnode, spp, szc);
3156
3157 /* attempt to trylock if there are sufficient already free pages */
3158 if (pgfree < pgcnt/ptcpthreshold) {
3159 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3160 return (0);
3161 }
3162
3163 skipptcpcheck:
3164
3165 for (i = 0; i < pgcnt; i++) {
3166 pp = &spp[i];
3167 if (!page_trylock(pp, SE_EXCL)) {
3168 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3169 while (--i != (pgcnt_t)-1) {
3170 pp = &spp[i];
3171 ASSERT(PAGE_EXCL(pp));
3172 page_unlock_nocapture(pp);
3173 }
3174 return (0);
3175 }
3176 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3177 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3178 !PP_ISFREE(pp)) {
3179 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3180 ASSERT(i == 0);
3181 page_unlock_nocapture(pp);
3182 return (0);
3183 }
3184 if (PP_ISNORELOC(pp)) {
3185 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3186 while (i != (pgcnt_t)-1) {
3187 pp = &spp[i];
3188 ASSERT(PAGE_EXCL(pp));
3189 page_unlock_nocapture(pp);
3190 i--;
3191 }
3192 return (0);
3193 }
3194 }
3195 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3196 return (1);
3197 }
3198
3199 /*
3200 * Claim large page pointed to by 'pp'. 'pp' is the starting set
3201 * of 'szc' constituent pages that had been locked exclusively previously.
3202 * Will attempt to relocate constituent pages in use.
3203 */
3204 static page_t *
3205 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3206 {
3207 spgcnt_t pgcnt, npgs, i;
3208 page_t *targpp, *rpp, *hpp;
3209 page_t *replpp = NULL;
3210 page_t *pplist = NULL;
3211
3212 ASSERT(pp != NULL);
3213
3214 pgcnt = page_get_pagecnt(szc);
3215 while (pgcnt) {
3216 ASSERT(PAGE_EXCL(pp));
3217 ASSERT(!PP_ISNORELOC(pp));
3218 if (PP_ISFREE(pp)) {
3219 /*
3220 * If this is a PG_FREE_LIST page then its
3221 * size code can change underneath us due to
3222 * page promotion or demotion. As an optimization
3223 * use page_list_sub_pages() instead of
3224 * page_list_sub().
3225 */
3226 if (PP_ISAGED(pp)) {
3227 page_list_sub_pages(pp, szc);
3228 if (pp->p_szc == szc) {
3229 return (pp);
3230 }
3231 ASSERT(pp->p_szc < szc);
3232 npgs = page_get_pagecnt(pp->p_szc);
3233 hpp = pp;
3234 for (i = 0; i < npgs; i++, pp++) {
3235 pp->p_szc = szc;
3236 }
3237 page_list_concat(&pplist, &hpp);
3238 pgcnt -= npgs;
3239 continue;
3240 }
3241 ASSERT(!PP_ISAGED(pp));
3242 ASSERT(pp->p_szc == 0);
3243 page_list_sub(pp, PG_CACHE_LIST);
3244 page_hashout(pp, NULL);
3245 PP_SETAGED(pp);
3246 pp->p_szc = szc;
3247 page_list_concat(&pplist, &pp);
3248 pp++;
3249 pgcnt--;
3250 continue;
3251 }
3252 npgs = page_get_pagecnt(pp->p_szc);
3253
3254 /*
3255 * page_create_wait freemem accounting is done by the caller of
3256 * page_get_freelist, so it is not necessary to call it prior to
3257 * calling page_get_replacement_page.
3258 *
3259 * page_get_replacement_page can call page_get_contig_pages
3260 * to acquire a large page (szc > 0); the replacement must be
3261 * smaller than the contig page size to avoid looping or
3262 * szc == 0 and PGI_PGCPSZC0 is set.
3263 */
3264 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3265 replpp = page_get_replacement_page(pp, NULL, 0);
3266 if (replpp) {
3267 npgs = page_get_pagecnt(pp->p_szc);
3268 ASSERT(npgs <= pgcnt);
3269 targpp = pp;
3270 }
3271 }
3272
3273 /*
3274 * If replacement is NULL or do_page_relocate fails, fail
3275 * coalescing of pages.
3276 */
3277 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3278 &npgs, NULL) != 0)) {
3279 /*
3280 * Unlock un-processed target list
3281 */
3282 while (pgcnt--) {
3283 ASSERT(PAGE_EXCL(pp));
3284 page_unlock_nocapture(pp);
3285 pp++;
3286 }
3287 /*
3288 * Free the processed target list.
3289 */
3290 while (pplist) {
3291 pp = pplist;
3292 page_sub(&pplist, pp);
3293 ASSERT(PAGE_EXCL(pp));
3294 ASSERT(pp->p_szc == szc);
3295 ASSERT(PP_ISFREE(pp));
3296 ASSERT(PP_ISAGED(pp));
3297 pp->p_szc = 0;
3298 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3299 page_unlock_nocapture(pp);
3300 }
3301
3302 if (replpp != NULL)
3303 page_free_replacement_page(replpp);
3304
3305 return (NULL);
3306 }
3307 ASSERT(pp == targpp);
3308
3309 /* LINTED */
3310 ASSERT(hpp = pp); /* That's right, it's an assignment */
3311
3312 pp += npgs;
3313 pgcnt -= npgs;
3314
3315 while (npgs--) {
3316 ASSERT(PAGE_EXCL(targpp));
3317 ASSERT(!PP_ISFREE(targpp));
3318 ASSERT(!PP_ISNORELOC(targpp));
3319 PP_SETFREE(targpp);
3320 ASSERT(PP_ISAGED(targpp));
3321 ASSERT(targpp->p_szc < szc || (szc == 0 &&
3322 (flags & PGI_PGCPSZC0)));
3323 targpp->p_szc = szc;
3324 targpp = targpp->p_next;
3325
3326 rpp = replpp;
3327 ASSERT(rpp != NULL);
3328 page_sub(&replpp, rpp);
3329 ASSERT(PAGE_EXCL(rpp));
3330 ASSERT(!PP_ISFREE(rpp));
3331 page_unlock_nocapture(rpp);
3332 }
3333 ASSERT(targpp == hpp);
3334 ASSERT(replpp == NULL);
3335 page_list_concat(&pplist, &targpp);
3336 }
3337 CHK_LPG(pplist, szc);
3338 return (pplist);
3339 }
3340
3341 /*
3342 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3343 * of 0 means nothing left after trim.
3344 */
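/*
 * Illustrative pfn values: if the mseg spans [0x10000, 0x20000), its
 * first page is PP_ISNORELOC but its last is not (lower part inside the
 * cage), and kcage_current_pfn() reports 0x18000, then the caller's
 * range is clipped to *lo = MAX(0x18000, pfnlo) and
 * *hi = MIN(pfnhi, 0x1ffff), with a return value of 1.
 */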
3345 int
3346 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3347 {
3348 pfn_t kcagepfn;
3349 int decr;
3350 int rc = 0;
3351
3352 if (PP_ISNORELOC(mseg->pages)) {
3353 if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3354
3355 /* lower part of this mseg inside kernel cage */
3356 decr = kcage_current_pfn(&kcagepfn);
3357
3358 /* kernel cage may have transitioned past mseg */
3359 if (kcagepfn >= mseg->pages_base &&
3360 kcagepfn < mseg->pages_end) {
3361 ASSERT(decr == 0);
3362 *lo = MAX(kcagepfn, pfnlo);
3363 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3364 rc = 1;
3365 }
3366 }
3367 /* else entire mseg in the cage */
3368 } else {
3369 if (PP_ISNORELOC(mseg->epages - 1)) {
3370
3371 /* upper part of this mseg inside kernel cage */
3372 decr = kcage_current_pfn(&kcagepfn);
3373
3374 /* kernel cage may have transitioned past mseg */
3375 if (kcagepfn >= mseg->pages_base &&
3376 kcagepfn < mseg->pages_end) {
3377 ASSERT(decr);
3378 *hi = MIN(kcagepfn, pfnhi);
3379 *lo = MAX(pfnlo, mseg->pages_base);
3380 rc = 1;
3381 }
3382 } else {
3383 /* entire mseg outside of kernel cage */
3384 *lo = MAX(pfnlo, mseg->pages_base);
3385 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3386 rc = 1;
3387 }
3388 }
3389 return (rc);
3390 }
3391
3392 /*
3393 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3394 * page with size code 'szc'. Claiming such a page requires acquiring
3395 * exclusive locks on all constituent pages (page_trylock_contig_pages),
3396 * relocating pages in use and concatenating these constituent pages into a
3397 * large page.
3398 *
3399 * The page lists do not have such a large page and page_freelist_split has
3400 * already failed to demote larger pages and/or coalesce smaller free pages.
3401 *
3402 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
3403 * pages with the same color as 'bin'.
3404 *
3405 * 'pfnflag' specifies the subset of the pfn range to search.
3406 */
3407
3408 static page_t *
3409 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3410 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3411 {
3412 struct memseg *mseg;
3413 pgcnt_t szcpgcnt = page_get_pagecnt(szc);
3414 pgcnt_t szcpgmask = szcpgcnt - 1;
3415 pfn_t randpfn;
3416 page_t *pp, *randpp, *endpp;
3417 uint_t colors, ceq_mask;
3418 /* LINTED : set but not used in function */
3419 uint_t color_mask;
3420 pfn_t hi, lo;
3421 uint_t skip;
3422 MEM_NODE_ITERATOR_DECL(it);
3423
3424 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3425
3426 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3427
3428 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3429 return (NULL);
3430
3431 ASSERT(szc < mmu_page_sizes);
3432
3433 colors = PAGE_GET_PAGECOLORS(szc);
3434 color_mask = colors - 1;
3435 if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3436 uchar_t ceq = colorequivszc[szc];
3437 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3438
3439 ASSERT(ceq_dif > 0);
3440 ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3441 } else {
3442 ceq_mask = 0;
3443 }
3444
3445 ASSERT(bin < colors);
3446
3447 /* clear "non-significant" color bits */
3448 bin &= ceq_mask;
3449
3450 /*
3451 * trim the pfn range to search based on pfnflag. pfnflag is set
3452 * when there have been previous page_get_contig_page failures to
3453 * limit the search.
3454 *
3455 * The high bit in pfnflag specifies the number of 'slots' in the
3456 * pfn range and the remainder of pfnflag specifies which slot.
3457 * For example, a value of 1010b would mean the second slot of
3458 * the pfn range that has been divided into 8 slots.
3459 */
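	/*
	 * Worked example with illustrative numbers: pfnflag == 1100b
	 * gives slots = 1 << 3 = 8 and slotid = 4. If the aligned range
	 * holds szcpages == 64 large pages, then
	 * slotlen = howmany(64, 8) = 8, so the search window starts
	 * 4 * 8 large pages above the aligned pfnlo and spans 8 large
	 * pages.
	 */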
3460 if (pfnflag > 1) {
3461 int slots = 1 << (highbit(pfnflag) - 1);
3462 int slotid = pfnflag & (slots - 1);
3463 pgcnt_t szcpages;
3464 int slotlen;
3465
3466 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3467 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3468 slotlen = howmany(szcpages, slots);
3469 /* skip if 'slotid' slot is empty */
3470 if (slotid * slotlen >= szcpages)
3471 return (NULL);
3472 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3473 ASSERT(pfnlo < pfnhi);
3474 if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3475 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3476 }
3477
3478 /*
3479 * This routine can be called recursively, so we shouldn't
3480 * acquire a reader lock if a write request is pending. This
3481 * could lead to a deadlock with the DR thread.
3482 *
3483 * Returning NULL informs the caller that we could not get
3484 * a contig page with the required characteristics.
3485 */
3486
3487 if (!memsegs_trylock(0))
3488 return (NULL);
3489
3490 /*
3491 * loop through memsegs to look for contig page candidates
3492 */
3493
3494 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3495 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3496 /* no overlap */
3497 continue;
3498 }
3499
3500 if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3501 /* mseg too small */
3502 continue;
3503
3504 /*
3505 * trim off kernel cage pages from pfn range and check for
3506 * a trimmed pfn range returned that does not span the
3507 * desired large page size.
3508 */
3509 if (kcage_on) {
3510 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3511 lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3512 continue;
3513 } else {
3514 lo = MAX(pfnlo, mseg->pages_base);
3515 hi = MIN(pfnhi, (mseg->pages_end - 1));
3516 }
3517
3518 /* round to szcpgcnt boundaries */
3519 lo = P2ROUNDUP(lo, szcpgcnt);
3520
3521 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3522 hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3523
3524 if (hi <= lo)
3525 continue;
3526
3527 /*
3528 * set lo to point to the pfn for the desired bin. Large
3529 * page sizes may only have a single page color
3530 */
3531 skip = szcpgcnt;
3532 if (ceq_mask > 0 || interleaved_mnodes) {
3533 /* set lo to point at appropriate color */
3534 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3535 (interleaved_mnodes &&
3536 PFN_2_MEM_NODE(lo) != mnode)) {
3537 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3538 color_mask, &it);
3539 }
3540 if (hi <= lo)
3541 /* mseg cannot satisfy color request */
3542 continue;
3543 }
3544
3545 /* randomly choose a point between lo and hi to begin search */
3546
3547 randpfn = (pfn_t)GETTICK();
3548 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3549 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3550 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3551 if (randpfn != (pfn_t)-1) {
3552 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3553 ceq_mask, color_mask, &it);
3554 }
3555 if (randpfn >= hi) {
3556 randpfn = lo;
3557 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3558 &it);
3559 }
3560 }
3561 randpp = mseg->pages + (randpfn - mseg->pages_base);
3562
3563 ASSERT(randpp->p_pagenum == randpfn);
3564
3565 pp = randpp;
3566 endpp = mseg->pages + (hi - mseg->pages_base) + 1;
3567
3568 ASSERT(randpp + szcpgcnt <= endpp);
3569
3570 do {
3571 ASSERT(!(pp->p_pagenum & szcpgmask));
3572 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3573
3574 if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3575 /* pages unlocked by page_claim on failure */
3576 if (page_claim_contig_pages(pp, szc, flags)) {
3577 memsegs_unlock(0);
3578 return (pp);
3579 }
3580 }
3581
3582 if (ceq_mask == 0 && !interleaved_mnodes) {
3583 pp += skip;
3584 } else {
3585 pfn_t pfn = pp->p_pagenum;
3586
3587 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3588 ceq_mask, color_mask, &it);
3589 if (pfn == (pfn_t)-1) {
3590 pp = endpp;
3591 } else {
3592 pp = mseg->pages +
3593 (pfn - mseg->pages_base);
3594 }
3595 }
3596 if (pp >= endpp) {
3597 /* start from the beginning */
3598 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3599 pp = mseg->pages + (lo - mseg->pages_base);
3600 ASSERT(pp->p_pagenum == lo);
3601 ASSERT(pp + szcpgcnt <= endpp);
3602 }
3603 } while (pp != randpp);
3604 }
3605 memsegs_unlock(0);
3606 return (NULL);
3607 }
3608
3609
3610 /*
3611 * controlling routine that searches through physical memory in an attempt to
3612 * claim a large page, based on the input parameters, that could not be
3613 * found on the page free lists.
3614 *
3615 * calls page_geti_contig_pages with an initial pfn range from the mnode
3616 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3617 * that overlaps with the kernel cage or does not match the requested page
3618 * color if PG_MATCH_COLOR is set. Since this search is very expensive,
3619 * page_geti_contig_pages may further limit the search range based on
3620 * previous failure counts (pgcpfailcnt[]).
3621 *
3622 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3623 * pagesize page that satisfies mtype.
3624 */
3625 page_t *
3626 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3627 uint_t flags)
3628 {
3629 pfn_t pfnlo, pfnhi; /* contig pages pfn range */
3630 page_t *pp;
3631 pgcnt_t pfnflag = 0; /* no limit on search if 0 */
3632
3633 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3634
3635 /* no allocations from cage */
3636 flags |= PGI_NOCAGE;
3637
3638 /* LINTED */
3639 MTYPE_START(mnode, mtype, flags);
3640 if (mtype < 0) { /* mnode does not have memory in mtype range */
3641 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3642 return (NULL);
3643 }
3644
3645 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3646
3647 /* do not limit search and ignore color if hi pri */
3648
3649 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3650 pfnflag = pgcpfailcnt[szc];
3651
3652 /* remove color match to improve chances */
3653
3654 if (flags & PGI_PGCPHIPRI || pfnflag)
3655 flags &= ~PG_MATCH_COLOR;
3656
3657 do {
3658 /* get pfn range based on mnode and mtype */
3659 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3660
3661 ASSERT(pfnhi >= pfnlo);
3662
3663 pp = page_geti_contig_pages(mnode, bin, szc, flags,
3664 pfnlo, pfnhi, pfnflag);
3665
3666 if (pp != NULL) {
3667 pfnflag = pgcpfailcnt[szc];
3668 if (pfnflag) {
3669 /* double the search size */
3670 pgcpfailcnt[szc] = pfnflag >> 1;
3671 }
3672 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3673 return (pp);
3674 }
3675 MTYPE_NEXT(mnode, mtype, flags);
3676 } while (mtype >= 0);
3677
3678 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3679 return (NULL);
3680 }
3681
3682 #if defined(__i386) || defined(__amd64)
3683 /*
3684 * Determine the likelihood of finding/coalescing a szc page.
3685 * Return 0 if the likelihood is small otherwise return 1.
3686 *
3687 * For now, be conservative and check only 1g pages and return 0
3688 * if there had been previous coalescing failures and the szc pages
3689 * needed to satisfy request would exhaust most of freemem.
3690 */
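/*
 * Illustrative check with made-up memory numbers: a 1g request is
 * pgcnt == 0x40000 base pages on x86; if pgcpfailcnt[szc] != 0,
 * throttlefree == 0x800 and freemem == 0x30000, then
 * pgcnt + throttlefree >= freemem and the request is denied (returns 0).
 */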
3691 int
3692 page_chk_freelist(uint_t szc)
3693 {
3694 pgcnt_t pgcnt;
3695
3696 if (szc <= 1)
3697 return (1);
3698
3699 pgcnt = page_get_pagecnt(szc);
3700 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3701 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3702 return (0);
3703 }
3704 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3705 return (1);
3706 }
3707 #endif
3708
3709 /*
3710 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3711 *
3712 * Does its own locking and accounting.
3713 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3714 * pages of the proper color even if there are pages of a different color.
3715 *
3716 * Finds a page, removes it, THEN locks it.
3717 */
3718
3719 /*ARGSUSED*/
3720 page_t *
3721 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3722 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3723 {
3724 struct as *as = seg->s_as;
3725 page_t *pp = NULL;
3726 ulong_t bin;
3727 uchar_t szc;
3728 int mnode;
3729 int mtype;
3730 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3731 lgrp_mnode_cookie_t lgrp_cookie;
3732
3733 page_get_func = page_get_mnode_freelist;
3734
3735 /*
3736 * If we aren't passed a specific lgroup, or passed a freed lgrp
3737 * assume we wish to allocate near to the current thread's home.
3738 */
3739 if (!LGRP_EXISTS(lgrp))
3740 lgrp = lgrp_home_lgrp();
3741
3742 if (kcage_on) {
3743 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3744 kcage_freemem < kcage_throttlefree + btop(size) &&
3745 curthread != kcage_cageout_thread) {
3746 /*
3747 * Set a "reserve" of kcage_throttlefree pages for
3748 * PG_PANIC and cageout thread allocations.
3749 *
3750 * Everybody else has to serialize in
3751 * page_create_get_something() to get a cage page, so
3752 * that we don't deadlock cageout!
3753 */
3754 return (NULL);
3755 }
3756 } else {
3757 flags &= ~PG_NORELOC;
3758 flags |= PGI_NOCAGE;
3759 }
3760
3761 /* LINTED */
3762 MTYPE_INIT(mtype, vp, vaddr, flags, size);
3763
3764 /*
3765 * Convert size to page size code.
3766 */
3767 if ((szc = page_szc(size)) == (uchar_t)-1)
3768 panic("page_get_freelist: illegal page size request");
3769 ASSERT(szc < mmu_page_sizes);
3770
3771 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3772
3773 /* LINTED */
3774 AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3775
3776 ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3777
3778 /*
3779 * Try to get a local page first, but try remote if we can't
3780 * get a page of the right color.
3781 */
3782 pgretry:
3783 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3784 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3785 pp = page_get_func(mnode, bin, mtype, szc, flags);
3786 if (pp != NULL) {
3787 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3788 DTRACE_PROBE4(page__get,
3789 lgrp_t *, lgrp,
3790 int, mnode,
3791 ulong_t, bin,
3792 uint_t, flags);
3793 return (pp);
3794 }
3795 }
3796 ASSERT(pp == NULL);
3797
3798 /*
3799 * for non-SZC0 PAGESIZE requests, check cachelist before checking
3800 * remote free lists. The caller is expected to call page_get_cachelist, which
3801 * will check local cache lists and remote free lists.
3802 */
3803 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3804 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3805 return (NULL);
3806 }
3807
3808 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3809
3810 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3811
3812 if (!(flags & PG_LOCAL)) {
3813 /*
3814 * Try to get a non-local freelist page.
3815 */
3816 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3817 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3818 pp = page_get_func(mnode, bin, mtype, szc, flags);
3819 if (pp != NULL) {
3820 DTRACE_PROBE4(page__get,
3821 lgrp_t *, lgrp,
3822 int, mnode,
3823 ulong_t, bin,
3824 uint_t, flags);
3825 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3826 return (pp);
3827 }
3828 }
3829 ASSERT(pp == NULL);
3830 }
3831
3832 /*
3833 * When the cage is off, chances are page_get_contig_pages() will fail
3834 * to lock a large page chunk, so in that case it is not called by
3835 * default. This can be changed via /etc/system.
3836 *
3837 * page_get_contig_pages() is also called to acquire a base pagesize
3838 * page for page_create_get_something().
3839 */
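/*
 * For example, "set pg_lpgcreate_nocage = 1" in /etc/system permits the
 * retry below even while the cage is off, and "set pg_contig_disable = 1"
 * suppresses it entirely (illustrative settings only; neither value is a
 * default).
 */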
3840 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3841 (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3842 (page_get_func != page_get_contig_pages)) {
3843
3844 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3845 page_get_func = page_get_contig_pages;
3846 goto pgretry;
3847 }
3848
3849 if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3850 page_get_func == page_get_contig_pages)
3851 SETPGCPFAILCNT(szc);
3852
3853 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3854 return (NULL);
3855 }
3856
3857 /*
3858 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3859 *
3860 * Does its own locking.
3861 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3862 * pages of the proper color even if there are pages of a different color.
3863 * Otherwise, scan the bins for ones with pages. For each bin with pages,
3864 * try to lock one of them. If no page can be locked, try the
3865 * next bin. Return NULL if a page cannot be found and locked.
3866 *
3867 * Finds a page, tries to lock it, then removes it.
3868 */
3869
3870 /*ARGSUSED*/
3871 page_t *
3872 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3873 caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3874 {
3875 page_t *pp;
3876 struct as *as = seg->s_as;
3877 ulong_t bin;
3878 /*LINTED*/
3879 int mnode;
3880 int mtype;
3881 lgrp_mnode_cookie_t lgrp_cookie;
3882
3883 /*
3884 * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3885 * assume we wish to allocate near the current thread's home.
3886 */
3887 if (!LGRP_EXISTS(lgrp))
3888 lgrp = lgrp_home_lgrp();
3889
3890 if (!kcage_on) {
3891 flags &= ~PG_NORELOC;
3892 flags |= PGI_NOCAGE;
3893 }
3894
3895 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3896 kcage_freemem <= kcage_throttlefree) {
3897 /*
3898 * Reserve kcage_throttlefree pages for critical kernel
3899 * threads.
3900 *
3901 * Everybody else has to go to page_create_get_something()
3902 * to get a cage page, so we don't deadlock cageout.
3903 */
3904 return (NULL);
3905 }
3906
3907 /* LINTED */
3908 AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3909
3910 ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3911
3912 /* LINTED */
3913 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3914
3915 VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3916
3917 /*
3918 * Try local cachelists first
3919 */
3920 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3921 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3922 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3923 if (pp != NULL) {
3924 VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3925 DTRACE_PROBE4(page__get,
3926 lgrp_t *, lgrp,
3927 int, mnode,
3928 ulong_t, bin,
3929 uint_t, flags);
3930 return (pp);
3931 }
3932 }
3933
3934 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3935
3936 /*
3937 * Try freelists/cachelists that are farther away
3938 * This is our only chance to allocate remote pages for PAGESIZE
3939 * requests.
3940 */
3941 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3942 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3943 pp = page_get_mnode_freelist(mnode, bin, mtype,
3944 0, flags);
3945 if (pp != NULL) {
3946 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3947 DTRACE_PROBE4(page__get,
3948 lgrp_t *, lgrp,
3949 int, mnode,
3950 ulong_t, bin,
3951 uint_t, flags);
3952 return (pp);
3953 }
3954 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3955 if (pp != NULL) {
3956 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3957 DTRACE_PROBE4(page__get,
3958 lgrp_t *, lgrp,
3959 int, mnode,
3960 ulong_t, bin,
3961 uint_t, flags);
3962 return (pp);
3963 }
3964 }
3965
3966 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3967 return (NULL);
3968 }
3969
3970 page_t *
3971 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3972 {
3973 kmutex_t *pcm;
3974 page_t *pp, *first_pp;
3975 uint_t sbin;
3976 int plw_initialized;
3977 page_list_walker_t plw;
3978
3979 VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3980
3981 /* LINTED */
3982 MTYPE_START(mnode, mtype, flags);
3983 if (mtype < 0) { /* mnode does not have memory in mtype range */
3984 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3985 return (NULL);
3986 }
3987
3988 try_again:
3989
3990 plw_initialized = 0;
3991 plw.plw_ceq_dif = 1;
3992
3993 /*
3994 * Only hold one cachelist lock at a time, that way we
3995 * can start anywhere and not have to worry about lock
3996 * ordering.
3997 */
3998
3999 for (plw.plw_count = 0;
4000 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
4001 sbin = bin;
4002 do {
4003
4004 if (!PAGE_CACHELISTS(mnode, bin, mtype))
4005 goto bin_empty_1;
4006 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
4007 mutex_enter(pcm);
4008 pp = PAGE_CACHELISTS(mnode, bin, mtype);
4009 if (pp == NULL)
4010 goto bin_empty_0;
4011
4012 first_pp = pp;
4013 ASSERT(pp->p_vnode);
4014 ASSERT(PP_ISAGED(pp) == 0);
4015 ASSERT(pp->p_szc == 0);
4016 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
4017 while (IS_DUMP_PAGE(pp) || !page_trylock(pp, SE_EXCL)) {
4018 pp = pp->p_next;
4019 ASSERT(pp->p_szc == 0);
4020 if (pp == first_pp) {
4021 /*
4022 * We have searched the complete list!
4023 * And all of them (might only be one)
4024 * are locked. This can happen since
4025 * these pages can also be found via
4026 * the hash list. When found via the
4027 * hash list, they are locked first,
4028 * then removed. We give up to let the
4029 * other thread run.
4030 */
4031 pp = NULL;
4032 break;
4033 }
4034 ASSERT(pp->p_vnode);
4035 ASSERT(PP_ISFREE(pp));
4036 ASSERT(PP_ISAGED(pp) == 0);
4037 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4038 mnode);
4039 }
4040
4041 if (pp) {
4042 page_t **ppp;
4043 /*
4044 * Found and locked a page.
4045 * Pull it off the list.
4046 */
4047 ASSERT(mtype == PP_2_MTYPE(pp));
4048 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4049 page_sub(ppp, pp);
4050 /*
4051 * Subtract counters before releasing pcm mutex
4052 * to avoid a race with page_freelist_coalesce
4053 * and page_freelist_split.
4054 */
4055 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4056 mutex_exit(pcm);
4057 ASSERT(pp->p_vnode);
4058 ASSERT(PP_ISAGED(pp) == 0);
4059 #if defined(__sparc)
4060 ASSERT(!kcage_on ||
4061 (flags & PG_NORELOC) == 0 ||
4062 PP_ISNORELOC(pp));
4063 if (PP_ISNORELOC(pp)) {
4064 kcage_freemem_sub(1);
4065 }
4066 #endif
4067 VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
4068 return (pp);
4069 }
4070 bin_empty_0:
4071 mutex_exit(pcm);
4072 bin_empty_1:
4073 if (plw_initialized == 0) {
4074 page_list_walk_init(0, flags, bin, 0, 1, &plw);
4075 plw_initialized = 1;
4076 }
4077 /* calculate the next bin with equivalent color */
4078 bin = ADD_MASKED(bin, plw.plw_bin_step,
4079 plw.plw_ceq_mask[0], plw.plw_color_mask);
4080 } while (sbin != bin);
4081
4082 if (plw.plw_ceq_dif > 1)
4083 bin = page_list_walk_next_bin(0, bin, &plw);
4084 }
4085
4086 MTYPE_NEXT(mnode, mtype, flags);
4087 if (mtype >= 0)
4088 goto try_again;
4089
4090 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4091 return (NULL);
4092 }
4093
4094 #ifdef DEBUG
4095 #define REPL_PAGE_STATS
4096 #endif /* DEBUG */
4097
4098 #ifdef REPL_PAGE_STATS
4099 struct repl_page_stats {
4100 uint_t ngets;
4101 uint_t ngets_noreloc;
4102 uint_t npgr_noreloc;
4103 uint_t nnopage_first;
4104 uint_t nnopage;
4105 uint_t nhashout;
4106 uint_t nnofree;
4107 uint_t nnext_pp;
4108 } repl_page_stats;
4109 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1)
4110 #else /* REPL_PAGE_STATS */
4111 #define REPL_STAT_INCR(v)
4112 #endif /* REPL_PAGE_STATS */
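/*
 * On a DEBUG kernel the counters above can typically be inspected with
 * mdb(1), e.g. (illustrative invocation, assuming CTF type data is
 * available for the symbol):
 *
 *     # echo 'repl_page_stats::print struct repl_page_stats' | mdb -k
 */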
4113
4114 int pgrppgcp;
4115
4116 /*
4117 * The freemem accounting must be done by the caller.
4118 * First we try to get a replacement page of the same size as like_pp;
4119 * if that is not possible, then we just get a set of discontiguous
4120 * PAGESIZE pages.
4121 */
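/*
 * A hypothetical caller sketch (illustration only; the caller's freemem
 * accounting and failure handling are caller-specific and only hinted at
 * here).  Passing NULL for lgrp_target means no preferred lgroup.
 *
 *     page_t *repl;
 *
 *     ASSERT(PAGE_EXCL(targ_pp));
 *     repl = page_get_replacement_page(targ_pp, NULL, 0);
 *     if (repl == NULL)
 *             return (ENOMEM);        (caller-specific failure path)
 */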
4122 page_t *
4123 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4124 uint_t pgrflags)
4125 {
4126 page_t *like_pp;
4127 page_t *pp, *pplist;
4128 page_t *pl = NULL;
4129 ulong_t bin;
4130 int mnode, page_mnode;
4131 int szc;
4132 spgcnt_t npgs, pg_cnt;
4133 pfn_t pfnum;
4134 int mtype;
4135 int flags = 0;
4136 lgrp_mnode_cookie_t lgrp_cookie;
4137 lgrp_t *lgrp;
4138
4139 REPL_STAT_INCR(ngets);
4140 like_pp = orig_like_pp;
4141 ASSERT(PAGE_EXCL(like_pp));
4142
4143 szc = like_pp->p_szc;
4144 npgs = page_get_pagecnt(szc);
4145 /*
4146 * Now we reset like_pp to the base page_t.
4147 * That way, we won't walk past the end of this 'szc' page.
4148 */
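/*
 * For example, for a hypothetical szc spanning 8 base pages, a pfn of
 * 0x1234 inside such a page rounds down to the base pfn
 * 0x1234 & ~7 == 0x1230.
 */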
4149 pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4150 like_pp = page_numtopp_nolock(pfnum);
4151 ASSERT(like_pp->p_szc == szc);
4152
4153 if (PP_ISNORELOC(like_pp)) {
4154 ASSERT(kcage_on);
4155 REPL_STAT_INCR(ngets_noreloc);
4156 flags = PGI_RELOCONLY;
4157 } else if (pgrflags & PGR_NORELOC) {
4158 ASSERT(kcage_on);
4159 REPL_STAT_INCR(npgr_noreloc);
4160 flags = PG_NORELOC;
4161 }
4162
4163 /*
4164 * Kernel pages must always be replaced with the same size
4165 * pages, since we cannot properly handle demotion of kernel
4166 * pages.
4167 */
4168 if (PP_ISKAS(like_pp))
4169 pgrflags |= PGR_SAMESZC;
4170
4171 /* LINTED */
4172 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4173
4174 while (npgs) {
4175 pplist = NULL;
4176 for (;;) {
4177 pg_cnt = page_get_pagecnt(szc);
4178 bin = PP_2_BIN(like_pp);
4179 ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4180 ASSERT(pg_cnt <= npgs);
4181
4182 /*
4183 * If an lgroup was specified, try to get the
4184 * page from that lgroup.
4185 * NOTE: Must be careful with code below because
4186 * lgroup may disappear and reappear since there
4187 * is no locking for lgroup here.
4188 */
4189 if (LGRP_EXISTS(lgrp_target)) {
4190 /*
4191 * Keep local variable for lgroup separate
4192 * from lgroup argument since this code should
4193 * only be exercised when lgroup argument
4194 * exists....
4195 */
4196 lgrp = lgrp_target;
4197
4198 /* Try the lgroup's freelists first */
4199 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4200 LGRP_SRCH_LOCAL);
4201 while ((pplist == NULL) &&
4202 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4203 != -1) {
4204 pplist =
4205 page_get_mnode_freelist(mnode, bin,
4206 mtype, szc, flags);
4207 }
4208
4209 /*
4210 * Now try its cachelists if this is a
4211 * small page. Don't need to do it for
4212 * larger ones since page_freelist_coalesce()
4213 * already failed.
4214 */
4215 if (pplist != NULL || szc != 0)
4216 break;
4217
4218 /* Now try its cachelists */
4219 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4220 LGRP_SRCH_LOCAL);
4221
4222 while ((pplist == NULL) &&
4223 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4224 != -1) {
4225 pplist =
4226 page_get_mnode_cachelist(bin, flags,
4227 mnode, mtype);
4228 }
4229 if (pplist != NULL) {
4230 page_hashout(pplist, NULL);
4231 PP_SETAGED(pplist);
4232 REPL_STAT_INCR(nhashout);
4233 break;
4234 }
4235 /* Done looking in this lgroup. Bail out. */
4236 break;
4237 }
4238
4239 /*
4240 * No lgroup was specified (or the lgroup was removed by
4241 * DR), so just try to get the page as close to
4242 * like_pp's mnode as possible.
4243 * First try the local freelist...
4244 */
4245 mnode = PP_2_MEM_NODE(like_pp);
4246 pplist = page_get_mnode_freelist(mnode, bin,
4247 mtype, szc, flags);
4248 if (pplist != NULL)
4249 break;
4250
4251 REPL_STAT_INCR(nnofree);
4252
4253 /*
4254 * ...then the local cachelist. Don't need to do it for
4255 * larger pages because page_freelist_coalesce() already
4256 * failed there anyway.
4257 */
4258 if (szc == 0) {
4259 pplist = page_get_mnode_cachelist(bin, flags,
4260 mnode, mtype);
4261 if (pplist != NULL) {
4262 page_hashout(pplist, NULL);
4263 PP_SETAGED(pplist);
4264 REPL_STAT_INCR(nhashout);
4265 break;
4266 }
4267 }
4268
4269 /* Now try remote freelists */
4270 page_mnode = mnode;
4271 lgrp =
4272 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4273 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4274 LGRP_SRCH_HIER);
4275 while (pplist == NULL &&
4276 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4277 != -1) {
4278 /*
4279 * Skip local mnode.
4280 */
4281 if ((mnode == page_mnode) ||
4282 (mem_node_config[mnode].exists == 0))
4283 continue;
4284
4285 pplist = page_get_mnode_freelist(mnode,
4286 bin, mtype, szc, flags);
4287 }
4288
4289 if (pplist != NULL)
4290 break;
4291
4292
4293 /* Now try remote cachelists */
4294 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4295 LGRP_SRCH_HIER);
4296 while (pplist == NULL && szc == 0) {
4297 mnode = lgrp_memnode_choose(&lgrp_cookie);
4298 if (mnode == -1)
4299 break;
4300 /*
4301 * Skip local mnode.
4302 */
4303 if ((mnode == page_mnode) ||
4304 (mem_node_config[mnode].exists == 0))
4305 continue;
4306
4307 pplist = page_get_mnode_cachelist(bin,
4308 flags, mnode, mtype);
4309
4310 if (pplist != NULL) {
4311 page_hashout(pplist, NULL);
4312 PP_SETAGED(pplist);
4313 REPL_STAT_INCR(nhashout);
4314 break;
4315 }
4316 }
4317
4318 /*
4319 * Break out of the loop in the following cases:
4320 * - If we successfully got a page.
4321 * - If pgrflags specified only returning a specific
4322 * page size and we could not find that page size.
4323 * - If we could not satisfy the request with PAGESIZE
4324 * or larger pages.
4325 */
4326 if (pplist != NULL || szc == 0)
4327 break;
4328
4329 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4330 /* try to find contig page */
4331
4332 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4333 LGRP_SRCH_HIER);
4334
4335 while ((pplist == NULL) &&
4336 (mnode =
4337 lgrp_memnode_choose(&lgrp_cookie))
4338 != -1) {
4339 pplist = page_get_contig_pages(
4340 mnode, bin, mtype, szc,
4341 flags | PGI_PGCPHIPRI);
4342 }
4343 break;
4344 }
4345
4346 /*
4347 * The correct thing to do here is try the next
4348 * page size down using szc--. Due to a bug
4349 * with the processing of HAT_RELOAD_SHARE
4350 * where the sfmmu_ttecnt arrays of all
4351 * hats sharing an ISM segment don't get updated,
4352 * using intermediate size pages for relocation
4353 * can lead to continuous page faults.
4354 */
4355 szc = 0;
4356 }
4357
4358 if (pplist != NULL) {
4359 DTRACE_PROBE4(page__get,
4360 lgrp_t *, lgrp,
4361 int, mnode,
4362 ulong_t, bin,
4363 uint_t, flags);
4364
4365 while (pplist != NULL && pg_cnt--) {
4366 ASSERT(pplist != NULL);
4367 pp = pplist;
4368 page_sub(&pplist, pp);
4369 PP_CLRFREE(pp);
4370 PP_CLRAGED(pp);
4371 page_list_concat(&pl, &pp);
4372 npgs--;
4373 like_pp = like_pp + 1;
4374 REPL_STAT_INCR(nnext_pp);
4375 }
4376 ASSERT(pg_cnt == 0);
4377 } else {
4378 break;
4379 }
4380 }
4381
4382 if (npgs) {
4383 /*
4384 * We were unable to allocate the necessary number
4385 * of pages.
4386 * We need to free up any pl.
4387 */
4388 REPL_STAT_INCR(nnopage);
4389 page_free_replacement_page(pl);
4390 return (NULL);
4391 } else {
4392 return (pl);
4393 }
4394 }
4395
4396 /*
4397 * demote a free large page to its constituent pages
4398 */
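/*
 * A hypothetical caller sketch (illustration only), matching the ASSERTs
 * below: the page must be locked (taken SE_EXCL here), free, and have a
 * non-zero size code.
 *
 *     if (page_trylock(pp, SE_EXCL)) {
 *             if (PP_ISFREE(pp) && pp->p_szc != 0)
 *                     page_demote_free_pages(pp);
 *             page_unlock(pp);
 *     }
 */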
4399 void
4400 page_demote_free_pages(page_t *pp)
4401 {
4402
4403 int mnode;
4404
4405 ASSERT(pp != NULL);
4406 ASSERT(PAGE_LOCKED(pp));
4407 ASSERT(PP_ISFREE(pp));
4408 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4409
4410 mnode = PP_2_MEM_NODE(pp);
4411 page_freelist_lock(mnode);
4412 if (pp->p_szc != 0) {
4413 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4414 pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4415 }
4416 page_freelist_unlock(mnode);
4417 ASSERT(pp->p_szc == 0);
4418 }
4419
4420 /*
4421 * Factor in colorequiv to check additional 'equivalent' bins.
4422 * colorequiv may be set in /etc/system
4423 */
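/*
 * For example, "set colorequiv = 4" in /etc/system (an illustrative value,
 * not a default) yields lowbit(4) - 1 == 2, so colorequivszc[szc] becomes
 * 0x20 for every page size with at least four colors, unless that entry
 * already holds a value of 0x20 or more.
 */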
4424 void
4425 page_set_colorequiv_arr(void)
4426 {
4427 if (colorequiv > 1) {
4428 int i;
4429 uint_t sv_a = lowbit(colorequiv) - 1;
4430
4431 if (sv_a > 15)
4432 sv_a = 15;
4433
4434 for (i = 0; i < MMU_PAGE_SIZES; i++) {
4435 uint_t colors;
4436 uint_t a = sv_a;
4437
4438 if ((colors = hw_page_array[i].hp_colors) <= 1) {
4439 continue;
4440 }
4441 while ((colors >> a) == 0)
4442 a--;
4443 if ((a << 4) > colorequivszc[i]) {
4444 colorequivszc[i] = (a << 4);
4445 }
4446 }
4447 }
4448 }
4449