1*73b046f9Smpi /* $OpenBSD: uvm_pmemrange.c,v 1.76 2024/11/08 15:54:33 mpi Exp $ */
2a3544580Soga
3a3544580Soga /*
482673a18Smpi * Copyright (c) 2024 Martin Pieuchot <mpi@openbsd.org>
5a3544580Soga * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
6a3544580Soga *
7a3544580Soga * Permission to use, copy, modify, and distribute this software for any
8a3544580Soga * purpose with or without fee is hereby granted, provided that the above
9a3544580Soga * copyright notice and this permission notice appear in all copies.
10a3544580Soga *
11a3544580Soga * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12a3544580Soga * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13a3544580Soga * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14a3544580Soga * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15a3544580Soga * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16a3544580Soga * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17a3544580Soga * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18a3544580Soga */
19a3544580Soga
20a3544580Soga #include <sys/param.h>
2190ee2fe0Sbeck #include <sys/systm.h>
22a3544580Soga #include <uvm/uvm.h>
23a3544580Soga #include <sys/malloc.h>
2490ee2fe0Sbeck #include <sys/kernel.h>
2563cf9471Smpi #include <sys/proc.h>
260bd995e1Stedu #include <sys/mount.h>
27a3544580Soga
28a3544580Soga /*
29a3544580Soga * 2 trees: addr tree and size tree.
30a3544580Soga *
31a3544580Soga * The allocator keeps chunks of free pages (called a range).
32a3544580Soga * Two pages are part of the same range if:
33a3544580Soga * - all pages in between are part of that range,
34a3544580Soga * - they are of the same memory type (zeroed or non-zeroed),
35a3544580Soga * - they are part of the same pmemrange.
36a3544580Soga * A pmemrange is a range of memory which is part of the same vm_physseg
37a3544580Soga * and has a use-count.
38a3544580Soga *
39a3544580Soga * addr tree is vm_page[0].objt
40a3544580Soga * size tree is vm_page[1].objt
41a3544580Soga *
42a3544580Soga * The size tree is not used for memory ranges of 1 page, instead,
43a3544580Soga * single queue is vm_page[0].pageq
44a3544580Soga *
459bec9e43Sjsg * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
46a3544580Soga * are joined, unless:
47a3544580Soga * - they have pages in between them which are not free
48a3544580Soga * - they belong to different memtypes (zeroed vs dirty memory)
49a3544580Soga * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
50a3544580Soga * - they are not a continuation of the same array
51a3544580Soga * The latter issue is caused by vm_physseg ordering and splitting from the
52a3544580Soga * MD initialization machinery. The MD code is dependent on freelists and
53a3544580Soga * happens to split ISA memory from non-ISA memory.
54a3544580Soga * (Note: freelists die die die!)
55a3544580Soga *
56a3544580Soga * uvm_page_init guarantees that every vm_physseg contains an array of
57a3544580Soga * struct vm_page. Also, uvm_page_physload allocates an array of struct
58a3544580Soga * vm_page. This code depends on that array. The array may break across
59a3544580Soga * vm_physseg boundaries.
60a3544580Soga */
61a3544580Soga
62a3544580Soga /*
63a3544580Soga * Validate the flags of the page. (Used in asserts.)
64a3544580Soga * Any free page must have the PQ_FREE flag set.
65a3544580Soga * Free pages may be zeroed. 66a3544580Soga * Pmap flags are left untouched. 67a3544580Soga * 68a3544580Soga * The PQ_FREE flag is not checked here: by not checking, we can easily use 69a3544580Soga * this check in pages which are freed. 70a3544580Soga */ 71a3544580Soga #define VALID_FLAGS(pg_flags) \ 721a0bb2c4Smiod (((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0) 73a3544580Soga 74a3544580Soga /* Tree comparators. */ 753accc929Sdlg int uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *, 763accc929Sdlg const struct uvm_pmemrange *); 77a3544580Soga int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *); 78a3544580Soga int uvm_pmr_pg_to_memtype(struct vm_page *); 79a3544580Soga 80a3544580Soga #ifdef DDB 81a3544580Soga void uvm_pmr_print(void); 82a3544580Soga #endif 83a3544580Soga 84d60475a6Smpi static inline int 85d60475a6Smpi in_pagedaemon(int allowsyncer) 86d60475a6Smpi { 87d60475a6Smpi if (curcpu()->ci_idepth > 0) 88d60475a6Smpi return 0; 89d60475a6Smpi if (curproc == uvm.pagedaemon_proc) 90d60475a6Smpi return 1; 91d60475a6Smpi /* XXX why is the syncer allowed to use the pagedaemon's reserve? */ 92d60475a6Smpi if (allowsyncer && (curproc == syncerproc)) 93d60475a6Smpi return 1; 94d60475a6Smpi return 0; 95d60475a6Smpi } 96d60475a6Smpi 97a3544580Soga /* 98a3544580Soga * Memory types. The page flags are used to derive what the current memory 99a3544580Soga * type of a page is. 100a3544580Soga */ 101a3544580Soga int 102a3544580Soga uvm_pmr_pg_to_memtype(struct vm_page *pg) 103a3544580Soga { 104a3544580Soga if (pg->pg_flags & PG_ZERO) 105a3544580Soga return UVM_PMR_MEMTYPE_ZERO; 106a3544580Soga /* Default: dirty memory. */ 107a3544580Soga return UVM_PMR_MEMTYPE_DIRTY; 108a3544580Soga } 109a3544580Soga 110a3544580Soga /* Trees. */ 111262a556aSdlg RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp); 112262a556aSdlg RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp); 1133accc929Sdlg RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr, 114a3544580Soga uvm_pmemrange_addr_cmp); 115a3544580Soga 116a3544580Soga /* Validation. 
*/
117a3544580Soga #ifdef DEBUG
118a3544580Soga void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
119a3544580Soga #else
120a3544580Soga #define uvm_pmr_assertvalid(pmr) do {} while (0)
121a3544580Soga #endif
122a3544580Soga
123dba5c185Smiod psize_t uvm_pmr_get1page(psize_t, int, struct pglist *,
124ef440d08Skettenis paddr_t, paddr_t, int);
125a3544580Soga
126a3544580Soga struct uvm_pmemrange *uvm_pmr_allocpmr(void);
127a3544580Soga struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
128a3544580Soga struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *,
129a3544580Soga struct vm_page *, int);
130a3544580Soga void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
131a3544580Soga struct vm_page *pg, struct vm_page **pg_prev,
132a3544580Soga struct vm_page **pg_next);
133a3544580Soga struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *,
134a3544580Soga struct vm_page *, paddr_t);
1351e57804eSbeck struct vm_page *uvm_pmr_findprevsegment(struct uvm_pmemrange *,
1361e57804eSbeck struct vm_page *, paddr_t);
137a3544580Soga psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t,
138a3544580Soga struct vm_page **, int);
1391e57804eSbeck psize_t uvm_pmr_remove_1strange_reverse(struct pglist *,
1401e57804eSbeck paddr_t *);
141a3544580Soga void uvm_pmr_split(paddr_t);
142a3544580Soga struct uvm_pmemrange *uvm_pmemrange_find(paddr_t);
143a3544580Soga struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
144a3544580Soga struct uvm_pmemrange *);
145a3544580Soga psize_t pow2divide(psize_t, psize_t);
146a3544580Soga struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *,
147a3544580Soga struct vm_page *, paddr_t, paddr_t, int);
148a3544580Soga
149a3544580Soga /*
150a3544580Soga * Computes num/denom and rounds it up to the next power-of-2.
151a3544580Soga *
152a3544580Soga * This is a division function which calculates an approximation of
153a3544580Soga * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
154a3544580Soga * have to be accurate.
155a3544580Soga *
156a3544580Soga * Providing too large a value makes the allocator slightly faster, at the
157a3544580Soga * risk of hitting the failure case more often. Providing too small a value
158a3544580Soga * makes the allocator a bit slower, but less likely to hit a failure case.
159a3544580Soga */
160a3544580Soga psize_t
161a3544580Soga pow2divide(psize_t num, psize_t denom)
162a3544580Soga {
163a3544580Soga int rshift;
164a3544580Soga
1654b9fb15fSoga for (rshift = 0; num > denom; rshift++, denom <<= 1)
1664b9fb15fSoga ;
167a3544580Soga return (paddr_t)1 << rshift;
168a3544580Soga }
169a3544580Soga
170a3544580Soga /*
171a3544580Soga * Predicate: lhs is a subrange of rhs.
172a3544580Soga *
173a3544580Soga * If rhs_low == 0: don't care about lower bound.
174a3544580Soga * If rhs_high == 0: don't care about upper bound.
175a3544580Soga */
176a3544580Soga #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \
177a3544580Soga (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \
178a3544580Soga ((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
179a3544580Soga
180a3544580Soga /*
181a3544580Soga * Predicate: lhs intersects with rhs.
182a3544580Soga *
183a3544580Soga * If rhs_low == 0: don't care about lower bound.
184a3544580Soga * If rhs_high == 0: don't care about upper bound.
185a3544580Soga * Ranges don't intersect if they don't have any page in common, array
186a3544580Soga * semantics mean that < instead of <= should be used here.
187a3544580Soga */ 188a3544580Soga #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \ 189a3544580Soga (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \ 190a3544580Soga ((rhs_high) == 0 || (lhs_low) < (rhs_high))) 191a3544580Soga 192a3544580Soga /* 193a3544580Soga * Align to power-of-2 alignment. 194a3544580Soga */ 195a3544580Soga #define PMR_ALIGN(pgno, align) \ 196a3544580Soga (((pgno) + ((align) - 1)) & ~((align) - 1)) 1971e57804eSbeck #define PMR_ALIGN_DOWN(pgno, align) \ 1981e57804eSbeck ((pgno) & ~((align) - 1)) 199a3544580Soga 200a3544580Soga 201a3544580Soga /* 202a3544580Soga * Comparator: sort by address ascending. 203a3544580Soga */ 204a3544580Soga int 2053accc929Sdlg uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs, 2063accc929Sdlg const struct uvm_pmemrange *rhs) 207a3544580Soga { 208a3544580Soga return lhs->low < rhs->low ? -1 : lhs->low > rhs->low; 209a3544580Soga } 210a3544580Soga 211a3544580Soga /* 212a3544580Soga * Comparator: sort by use ascending. 213a3544580Soga * 214a3544580Soga * The higher the use value of a range, the more devices need memory in 2157a3af63bStb * this range. Therefore allocate from the range with the lowest use first. 216a3544580Soga */ 217a3544580Soga int 218a3544580Soga uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs) 219a3544580Soga { 220a3544580Soga int result; 221a3544580Soga 222a3544580Soga result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use; 223a3544580Soga if (result == 0) 224a3544580Soga result = uvm_pmemrange_addr_cmp(lhs, rhs); 225a3544580Soga return result; 226a3544580Soga } 227a3544580Soga 228a3544580Soga int 229262a556aSdlg uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 230a3544580Soga { 231a3544580Soga paddr_t lhs_addr, rhs_addr; 232a3544580Soga 233a3544580Soga lhs_addr = VM_PAGE_TO_PHYS(lhs); 234a3544580Soga rhs_addr = VM_PAGE_TO_PHYS(rhs); 235a3544580Soga 236a3544580Soga return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr); 237a3544580Soga } 238a3544580Soga 239a3544580Soga int 240262a556aSdlg uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 241a3544580Soga { 242a3544580Soga psize_t lhs_size, rhs_size; 243a3544580Soga int cmp; 244a3544580Soga 245a3544580Soga /* Using second tree, so we receive pg[1] instead of pg[0]. */ 246a3544580Soga lhs_size = (lhs - 1)->fpgsz; 247a3544580Soga rhs_size = (rhs - 1)->fpgsz; 248a3544580Soga 249a3544580Soga cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size); 250a3544580Soga if (cmp == 0) 251a3544580Soga cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1); 252a3544580Soga return cmp; 253a3544580Soga } 254a3544580Soga 255a3544580Soga /* 256a3544580Soga * Find the first range of free pages that is at least sz pages long. 
257a3544580Soga */ 258a3544580Soga struct vm_page * 259a3544580Soga uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti) 260a3544580Soga { 261a3544580Soga struct vm_page *node, *best; 262a3544580Soga 263a3544580Soga KASSERT(sz >= 1); 264a3544580Soga 265a3544580Soga if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti])) 266a3544580Soga return TAILQ_FIRST(&pmr->single[mti]); 267a3544580Soga 268262a556aSdlg node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]); 269a3544580Soga best = NULL; 270a3544580Soga while (node != NULL) { 271a3544580Soga if ((node - 1)->fpgsz >= sz) { 272a3544580Soga best = (node - 1); 273262a556aSdlg node = RBT_LEFT(uvm_objtree, node); 274a3544580Soga } else 275262a556aSdlg node = RBT_RIGHT(uvm_objtree, node); 276a3544580Soga } 277a3544580Soga return best; 278a3544580Soga } 279a3544580Soga 280a3544580Soga /* 281a3544580Soga * Finds the next range. The next range has a size >= pg->fpgsz. 282a3544580Soga * Returns NULL if no more ranges are available. 283a3544580Soga */ 284a3544580Soga struct vm_page * 285a3544580Soga uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt) 286a3544580Soga { 287a3544580Soga struct vm_page *npg; 288a3544580Soga 289a3544580Soga KASSERT(pmr != NULL && pg != NULL); 290a3544580Soga if (pg->fpgsz == 1) { 291a3544580Soga if (TAILQ_NEXT(pg, pageq) != NULL) 292a3544580Soga return TAILQ_NEXT(pg, pageq); 293a3544580Soga else 294262a556aSdlg npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]); 295a3544580Soga } else 296262a556aSdlg npg = RBT_NEXT(uvm_pmr_size, pg + 1); 297a3544580Soga 298a3544580Soga return npg == NULL ? NULL : npg - 1; 299a3544580Soga } 300a3544580Soga 301a3544580Soga /* 302a3544580Soga * Finds the previous and next ranges relative to the (uninserted) pg range. 303a3544580Soga * 304a3544580Soga * *pg_prev == NULL if no previous range is available, that can join with 305a3544580Soga * pg. 306a3544580Soga * *pg_next == NULL if no next range is available, that can join with 307a3544580Soga * pg. 308a3544580Soga */ 309a3544580Soga void 310a3544580Soga uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg, 311a3544580Soga struct vm_page **pg_prev, struct vm_page **pg_next) 312a3544580Soga { 313a3544580Soga KASSERT(pg_prev != NULL && pg_next != NULL); 314a3544580Soga 315262a556aSdlg *pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 316a3544580Soga if (*pg_next == NULL) 317262a556aSdlg *pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 318a3544580Soga else 319262a556aSdlg *pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next); 320a3544580Soga 321a3544580Soga KDASSERT(*pg_next == NULL || 322a3544580Soga VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg)); 323a3544580Soga KDASSERT(*pg_prev == NULL || 324a3544580Soga VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg)); 325a3544580Soga 326a3544580Soga /* Reset if not contig. */ 327a3544580Soga if (*pg_prev != NULL && 328a3544580Soga (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz 329a3544580Soga != atop(VM_PAGE_TO_PHYS(pg)) || 330a3544580Soga *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */ 331a3544580Soga uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg))) 332a3544580Soga *pg_prev = NULL; 333a3544580Soga if (*pg_next != NULL && 334a3544580Soga (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz 335a3544580Soga != atop(VM_PAGE_TO_PHYS(*pg_next)) || 336a3544580Soga pg + pg->fpgsz != *pg_next || /* Array broke. 
*/ 337a3544580Soga uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg))) 338a3544580Soga *pg_next = NULL; 339a3544580Soga return; 340a3544580Soga } 341a3544580Soga 342a3544580Soga /* 343a3544580Soga * Remove a range from the address tree. 344a3544580Soga * Address tree maintains pmr counters. 345a3544580Soga */ 346a3544580Soga void 347a3544580Soga uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg) 348a3544580Soga { 349262a556aSdlg KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 350a3544580Soga KDASSERT(pg->pg_flags & PQ_FREE); 351262a556aSdlg RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg); 352a3544580Soga 353a3544580Soga pmr->nsegs--; 354a3544580Soga } 355a3544580Soga /* 356a3544580Soga * Remove a range from the size tree. 357a3544580Soga */ 358a3544580Soga void 359a3544580Soga uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 360a3544580Soga { 361a3544580Soga int memtype; 362a3544580Soga #ifdef DEBUG 363a3544580Soga struct vm_page *i; 364a3544580Soga #endif 365a3544580Soga 366a3544580Soga KDASSERT(pg->fpgsz >= 1); 367a3544580Soga KDASSERT(pg->pg_flags & PQ_FREE); 368a3544580Soga memtype = uvm_pmr_pg_to_memtype(pg); 369a3544580Soga 370a3544580Soga if (pg->fpgsz == 1) { 371a3544580Soga #ifdef DEBUG 372a3544580Soga TAILQ_FOREACH(i, &pmr->single[memtype], pageq) { 373a3544580Soga if (i == pg) 374a3544580Soga break; 375a3544580Soga } 376a3544580Soga KDASSERT(i == pg); 377a3544580Soga #endif 378a3544580Soga TAILQ_REMOVE(&pmr->single[memtype], pg, pageq); 379a3544580Soga } else { 3806d0d0baaSdlg KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype], 381a3544580Soga pg + 1) == pg + 1); 382262a556aSdlg RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1); 383a3544580Soga } 384a3544580Soga } 385a3544580Soga /* Remove from both trees. */ 386a3544580Soga void 387a3544580Soga uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg) 388a3544580Soga { 389a3544580Soga uvm_pmr_assertvalid(pmr); 390a3544580Soga uvm_pmr_remove_size(pmr, pg); 391a3544580Soga uvm_pmr_remove_addr(pmr, pg); 392a3544580Soga uvm_pmr_assertvalid(pmr); 393a3544580Soga } 394a3544580Soga 395a3544580Soga /* 396a3544580Soga * Insert the range described in pg. 397a3544580Soga * Returns the range thus created (which may be joined with the previous and 398a3544580Soga * next ranges). 399a3544580Soga * If no_join, the caller guarantees that the range cannot possibly join 4009bec9e43Sjsg * with adjacent ranges. 
401a3544580Soga */ 402a3544580Soga struct vm_page * 403a3544580Soga uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 404a3544580Soga { 405a3544580Soga struct vm_page *prev, *next; 406a3544580Soga 407a3544580Soga #ifdef DEBUG 408a3544580Soga struct vm_page *i; 409a3544580Soga int mt; 410a3544580Soga #endif 411a3544580Soga 412a3544580Soga KDASSERT(pg->pg_flags & PQ_FREE); 413a3544580Soga KDASSERT(pg->fpgsz >= 1); 414a3544580Soga 415a3544580Soga #ifdef DEBUG 416a3544580Soga for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 417a3544580Soga TAILQ_FOREACH(i, &pmr->single[mt], pageq) 418a3544580Soga KDASSERT(i != pg); 419a3544580Soga if (pg->fpgsz > 1) { 4206d0d0baaSdlg KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt], 421a3544580Soga pg + 1) == NULL); 422a3544580Soga } 4236d0d0baaSdlg KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL); 424a3544580Soga } 425a3544580Soga #endif 426a3544580Soga 427a3544580Soga if (!no_join) { 428a3544580Soga uvm_pmr_pnaddr(pmr, pg, &prev, &next); 429a3544580Soga if (next != NULL) { 430a3544580Soga uvm_pmr_remove_size(pmr, next); 431a3544580Soga uvm_pmr_remove_addr(pmr, next); 432a3544580Soga pg->fpgsz += next->fpgsz; 433a3544580Soga next->fpgsz = 0; 434a3544580Soga } 435a3544580Soga if (prev != NULL) { 436a3544580Soga uvm_pmr_remove_size(pmr, prev); 437a3544580Soga prev->fpgsz += pg->fpgsz; 438a3544580Soga pg->fpgsz = 0; 439a3544580Soga return prev; 440a3544580Soga } 441a3544580Soga } 442a3544580Soga 443262a556aSdlg RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg); 444a3544580Soga 445a3544580Soga pmr->nsegs++; 446a3544580Soga 447a3544580Soga return pg; 448a3544580Soga } 449a3544580Soga /* 450a3544580Soga * Insert the range described in pg. 451a3544580Soga * Returns the range thus created (which may be joined with the previous and 452a3544580Soga * next ranges). 453a3544580Soga * Page must already be in the address tree. 454a3544580Soga */ 455a3544580Soga void 456a3544580Soga uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 457a3544580Soga { 458a3544580Soga int memtype; 459a3544580Soga #ifdef DEBUG 460a3544580Soga struct vm_page *i; 461a3544580Soga int mti; 462a3544580Soga #endif 463a3544580Soga 464a3544580Soga KDASSERT(pg->fpgsz >= 1); 465a3544580Soga KDASSERT(pg->pg_flags & PQ_FREE); 466a3544580Soga 467a3544580Soga memtype = uvm_pmr_pg_to_memtype(pg); 468a3544580Soga #ifdef DEBUG 469a3544580Soga for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 470a3544580Soga TAILQ_FOREACH(i, &pmr->single[mti], pageq) 471a3544580Soga KDASSERT(i != pg); 472a3544580Soga if (pg->fpgsz > 1) { 4736d0d0baaSdlg KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti], 474a3544580Soga pg + 1) == NULL); 475a3544580Soga } 4766d0d0baaSdlg KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 477a3544580Soga } 478a3544580Soga for (i = pg; i < pg + pg->fpgsz; i++) 479a3544580Soga KASSERT(uvm_pmr_pg_to_memtype(i) == memtype); 480a3544580Soga #endif 481a3544580Soga 482a3544580Soga if (pg->fpgsz == 1) 483a3544580Soga TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq); 484a3544580Soga else 485262a556aSdlg RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1); 486a3544580Soga } 487a3544580Soga /* Insert in both trees. 
*/ 488a3544580Soga struct vm_page * 489a3544580Soga uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 490a3544580Soga { 491a3544580Soga uvm_pmr_assertvalid(pmr); 492a3544580Soga pg = uvm_pmr_insert_addr(pmr, pg, no_join); 493a3544580Soga uvm_pmr_insert_size(pmr, pg); 494a3544580Soga uvm_pmr_assertvalid(pmr); 495a3544580Soga return pg; 496a3544580Soga } 497a3544580Soga 498a3544580Soga /* 499a3544580Soga * Find the last page that is part of this segment. 500a3544580Soga * => pg: the range at which to start the search. 501a3544580Soga * => boundary: the page number boundary specification (0 = no boundary). 502a3544580Soga * => pmr: the pmemrange of the page. 503a3544580Soga * 504a3544580Soga * This function returns 1 before the next range, so if you want to have the 505a3544580Soga * next range, you need to run TAILQ_NEXT(result, pageq) after calling. 506a3544580Soga * The reason is that this way, the length of the segment is easily 507a3544580Soga * calculated using: atop(result) - atop(pg) + 1. 508a3544580Soga * Hence this function also never returns NULL. 509a3544580Soga */ 510a3544580Soga struct vm_page * 511a3544580Soga uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr, 512a3544580Soga struct vm_page *pg, paddr_t boundary) 513a3544580Soga { 514a3544580Soga paddr_t first_boundary; 515a3544580Soga struct vm_page *next; 516a3544580Soga struct vm_page *prev; 517a3544580Soga 518a3544580Soga KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && 519a3544580Soga pmr->high > atop(VM_PAGE_TO_PHYS(pg))); 520a3544580Soga if (boundary != 0) { 521a3544580Soga first_boundary = 522a3544580Soga PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary); 523a3544580Soga } else 524a3544580Soga first_boundary = 0; 525a3544580Soga 526a3544580Soga /* 527a3544580Soga * Increase next until it hits the first page of the next segment. 528a3544580Soga * 529a3544580Soga * While loop checks the following: 530a3544580Soga * - next != NULL we have not reached the end of pgl 531a3544580Soga * - boundary == 0 || next < first_boundary 532a3544580Soga * we do not cross a boundary 533a3544580Soga * - atop(prev) + 1 == atop(next) 534a3544580Soga * still in the same segment 535a3544580Soga * - low <= last 536a3544580Soga * - high > last still in the same memory range 537a3544580Soga * - memtype is equal allocator is unable to view different memtypes 538a3544580Soga * as part of the same segment 539a3544580Soga * - prev + 1 == next no array breakage occurs 540a3544580Soga */ 541a3544580Soga prev = pg; 542a3544580Soga next = TAILQ_NEXT(prev, pageq); 543a3544580Soga while (next != NULL && 544a3544580Soga (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) && 545a3544580Soga atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) && 546a3544580Soga pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && 547a3544580Soga pmr->high > atop(VM_PAGE_TO_PHYS(next)) && 548a3544580Soga uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && 549a3544580Soga prev + 1 == next) { 550a3544580Soga prev = next; 551a3544580Soga next = TAILQ_NEXT(prev, pageq); 552a3544580Soga } 553a3544580Soga 554a3544580Soga /* 555a3544580Soga * End of this segment. 556a3544580Soga */ 557a3544580Soga return prev; 558a3544580Soga } 559a3544580Soga 560a3544580Soga /* 5611e57804eSbeck * Find the first page that is part of this segment. 5621e57804eSbeck * => pg: the range at which to start the search. 5631e57804eSbeck * => boundary: the page number boundary specification (0 = no boundary). 
5641e57804eSbeck * => pmr: the pmemrange of the page. 5651e57804eSbeck * 5661e57804eSbeck * This function returns 1 after the previous range, so if you want to have the 5671e57804eSbeck * previous range, you need to run TAILQ_NEXT(result, pageq) after calling. 5681e57804eSbeck * The reason is that this way, the length of the segment is easily 5691e57804eSbeck * calculated using: atop(pg) - atop(result) + 1. 5701e57804eSbeck * Hence this function also never returns NULL. 5711e57804eSbeck */ 5721e57804eSbeck struct vm_page * 5731e57804eSbeck uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr, 5741e57804eSbeck struct vm_page *pg, paddr_t boundary) 5751e57804eSbeck { 5761e57804eSbeck paddr_t first_boundary; 5771e57804eSbeck struct vm_page *next; 5781e57804eSbeck struct vm_page *prev; 5791e57804eSbeck 5801e57804eSbeck KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && 5811e57804eSbeck pmr->high > atop(VM_PAGE_TO_PHYS(pg))); 5821e57804eSbeck if (boundary != 0) { 5831e57804eSbeck first_boundary = 5841e57804eSbeck PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary); 5851e57804eSbeck } else 5861e57804eSbeck first_boundary = 0; 5871e57804eSbeck 5881e57804eSbeck /* 5891e57804eSbeck * Increase next until it hits the first page of the previous segment. 5901e57804eSbeck * 5911e57804eSbeck * While loop checks the following: 5921e57804eSbeck * - next != NULL we have not reached the end of pgl 5931e57804eSbeck * - boundary == 0 || next >= first_boundary 5941e57804eSbeck * we do not cross a boundary 5951e57804eSbeck * - atop(prev) - 1 == atop(next) 5961e57804eSbeck * still in the same segment 5971e57804eSbeck * - low <= last 5981e57804eSbeck * - high > last still in the same memory range 5991e57804eSbeck * - memtype is equal allocator is unable to view different memtypes 6001e57804eSbeck * as part of the same segment 6011e57804eSbeck * - prev - 1 == next no array breakage occurs 6021e57804eSbeck */ 6031e57804eSbeck prev = pg; 6041e57804eSbeck next = TAILQ_NEXT(prev, pageq); 6051e57804eSbeck while (next != NULL && 6061e57804eSbeck (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) && 6071e57804eSbeck atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) && 6081e57804eSbeck pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && 6091e57804eSbeck pmr->high > atop(VM_PAGE_TO_PHYS(next)) && 6101e57804eSbeck uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && 6111e57804eSbeck prev - 1 == next) { 6121e57804eSbeck prev = next; 6131e57804eSbeck next = TAILQ_NEXT(prev, pageq); 6141e57804eSbeck } 6151e57804eSbeck 6161e57804eSbeck /* 6171e57804eSbeck * Start of this segment. 6181e57804eSbeck */ 6191e57804eSbeck return prev; 6201e57804eSbeck } 6211e57804eSbeck 6221e57804eSbeck /* 623a3544580Soga * Remove the first segment of contiguous pages from pgl. 624a3544580Soga * A segment ends if it crosses boundary (unless boundary = 0) or 625a3544580Soga * if it would enter a different uvm_pmemrange. 626a3544580Soga * 627a3544580Soga * Work: the page range that the caller is currently working with. 628a3544580Soga * May be null. 629a3544580Soga * 630a3544580Soga * If is_desperate is non-zero, the smallest segment is erased. Otherwise, 631a3544580Soga * the first segment is erased (which, if called by uvm_pmr_getpages(), 632a3544580Soga * probably is the smallest or very close to it). 
633a3544580Soga */ 634a3544580Soga psize_t 635a3544580Soga uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary, 636a3544580Soga struct vm_page **work, int is_desperate) 637a3544580Soga { 6381e57804eSbeck struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest; 639a3544580Soga psize_t count; 640a3544580Soga struct uvm_pmemrange *pmr, *pmr_iter; 641a3544580Soga 642a3544580Soga KASSERT(!TAILQ_EMPTY(pgl)); 643a3544580Soga 644a3544580Soga /* 645a3544580Soga * Initialize to first page. 646a3544580Soga * Unless desperate scan finds a better candidate, this is what'll be 647a3544580Soga * erased. 648a3544580Soga */ 649a3544580Soga start = TAILQ_FIRST(pgl); 650a3544580Soga pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); 651a3544580Soga end = uvm_pmr_findnextsegment(pmr, start, boundary); 652a3544580Soga 653a3544580Soga /* 654a3544580Soga * If we are desperate, we _really_ want to get rid of the smallest 655a3544580Soga * element (rather than a close match to the smallest element). 656a3544580Soga */ 657a3544580Soga if (is_desperate) { 658d3b3d34dSthib /* Linear search for smallest segment. */ 659a3544580Soga pmr_iter = pmr; 660a3544580Soga for (iter = TAILQ_NEXT(end, pageq); 661a3544580Soga iter != NULL && start != end; 662a3544580Soga iter = TAILQ_NEXT(iter_end, pageq)) { 663a3544580Soga /* 664a3544580Soga * Only update pmr if it doesn't match current 665a3544580Soga * iteration. 666a3544580Soga */ 667a3544580Soga if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) || 668a3544580Soga pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) { 669a3544580Soga pmr_iter = uvm_pmemrange_find(atop( 670a3544580Soga VM_PAGE_TO_PHYS(iter))); 671a3544580Soga } 672a3544580Soga 673a3544580Soga iter_end = uvm_pmr_findnextsegment(pmr_iter, iter, 674a3544580Soga boundary); 675a3544580Soga 676a3544580Soga /* 677a3544580Soga * Current iteration is smaller than best match so 678a3544580Soga * far; update. 679a3544580Soga */ 680a3544580Soga if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) < 681a3544580Soga VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) { 682a3544580Soga start = iter; 683a3544580Soga end = iter_end; 684a3544580Soga pmr = pmr_iter; 685a3544580Soga } 686a3544580Soga } 687a3544580Soga } 688a3544580Soga 689a3544580Soga /* 690a3544580Soga * Calculate count and end of the list. 691a3544580Soga */ 692a3544580Soga count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1; 6931e57804eSbeck lowest = start; 694a3544580Soga end = TAILQ_NEXT(end, pageq); 695a3544580Soga 696a3544580Soga /* 697a3544580Soga * Actually remove the range of pages. 698a3544580Soga * 699a3544580Soga * Sadly, this cannot be done using pointer iteration: 700a3544580Soga * vm_physseg is not guaranteed to be sorted on address, hence 701a3544580Soga * uvm_page_init() may not have initialized its array sorted by 702a3544580Soga * page number. 703a3544580Soga */ 704a3544580Soga for (iter = start; iter != end; iter = iter_end) { 705a3544580Soga iter_end = TAILQ_NEXT(iter, pageq); 706a3544580Soga TAILQ_REMOVE(pgl, iter, pageq); 707a3544580Soga } 708a3544580Soga 7091e57804eSbeck lowest->fpgsz = count; 7101e57804eSbeck inserted = uvm_pmr_insert(pmr, lowest, 0); 711a3544580Soga 712a3544580Soga /* 713a3544580Soga * If the caller was working on a range and this function modified 714a3544580Soga * that range, update the pointer. 
715a3544580Soga */ 716a3544580Soga if (work != NULL && *work != NULL && 717a3544580Soga atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) && 718a3544580Soga atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz > 719a3544580Soga atop(VM_PAGE_TO_PHYS(*work))) 720a3544580Soga *work = inserted; 721a3544580Soga return count; 722a3544580Soga } 723a3544580Soga 724a3544580Soga /* 7251e57804eSbeck * Remove the first segment of contiguous pages from a pgl 7261e57804eSbeck * with the list elements in reverse order of physaddr. 7271e57804eSbeck * 7281e57804eSbeck * A segment ends if it would enter a different uvm_pmemrange. 7291e57804eSbeck * 7301e57804eSbeck * Stores starting physical address of the segment in pstart. 7311e57804eSbeck */ 7321e57804eSbeck psize_t 7331e57804eSbeck uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart) 7341e57804eSbeck { 7351e57804eSbeck struct vm_page *start, *end, *iter, *iter_end, *lowest; 7361e57804eSbeck psize_t count; 7371e57804eSbeck struct uvm_pmemrange *pmr; 7381e57804eSbeck 7391e57804eSbeck KASSERT(!TAILQ_EMPTY(pgl)); 7401e57804eSbeck 7411e57804eSbeck start = TAILQ_FIRST(pgl); 7421e57804eSbeck pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); 7431e57804eSbeck end = uvm_pmr_findprevsegment(pmr, start, 0); 7441e57804eSbeck 7451e57804eSbeck KASSERT(end <= start); 7461e57804eSbeck 7471e57804eSbeck /* 7481e57804eSbeck * Calculate count and end of the list. 7491e57804eSbeck */ 7501e57804eSbeck count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1; 7511e57804eSbeck lowest = end; 7521e57804eSbeck end = TAILQ_NEXT(end, pageq); 7531e57804eSbeck 7541e57804eSbeck /* 7551e57804eSbeck * Actually remove the range of pages. 7561e57804eSbeck * 7571e57804eSbeck * Sadly, this cannot be done using pointer iteration: 7581e57804eSbeck * vm_physseg is not guaranteed to be sorted on address, hence 7591e57804eSbeck * uvm_page_init() may not have initialized its array sorted by 7601e57804eSbeck * page number. 7611e57804eSbeck */ 7621e57804eSbeck for (iter = start; iter != end; iter = iter_end) { 7631e57804eSbeck iter_end = TAILQ_NEXT(iter, pageq); 7641e57804eSbeck TAILQ_REMOVE(pgl, iter, pageq); 7651e57804eSbeck } 7661e57804eSbeck 7671e57804eSbeck lowest->fpgsz = count; 7681e57804eSbeck (void) uvm_pmr_insert(pmr, lowest, 0); 7691e57804eSbeck 7701e57804eSbeck *pstart = VM_PAGE_TO_PHYS(lowest); 7711e57804eSbeck return count; 7721e57804eSbeck } 7731e57804eSbeck 7741e57804eSbeck /* 775a3544580Soga * Extract a number of pages from a segment of free pages. 776a3544580Soga * Called by uvm_pmr_getpages. 777a3544580Soga * 778a3544580Soga * Returns the segment that was created from pages left over at the tail 779a3544580Soga * of the remove set of pages, or NULL if no pages were left at the tail. 
780a3544580Soga */ 781a3544580Soga struct vm_page * 782a3544580Soga uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg, 783a3544580Soga paddr_t start, paddr_t end, struct pglist *result) 784a3544580Soga { 785a3544580Soga struct vm_page *after, *pg_i; 786a3544580Soga psize_t before_sz, after_sz; 787a3544580Soga #ifdef DEBUG 788a3544580Soga psize_t i; 789a3544580Soga #endif 790a3544580Soga 791a3544580Soga KDASSERT(end > start); 792a3544580Soga KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg))); 793a3544580Soga KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz); 794a3544580Soga KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start); 795a3544580Soga KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end); 796a3544580Soga 797a3544580Soga before_sz = start - atop(VM_PAGE_TO_PHYS(pg)); 798a3544580Soga after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end; 799a3544580Soga KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz); 800a3544580Soga uvm_pmr_assertvalid(pmr); 801a3544580Soga 802a3544580Soga uvm_pmr_remove_size(pmr, pg); 803a3544580Soga if (before_sz == 0) 804a3544580Soga uvm_pmr_remove_addr(pmr, pg); 80556db1aa6Sariane after = pg + before_sz + (end - start); 806a3544580Soga 807a3544580Soga /* Add selected pages to result. */ 80856db1aa6Sariane for (pg_i = pg + before_sz; pg_i != after; pg_i++) { 809c6b41277Sariane KASSERT(pg_i->pg_flags & PQ_FREE); 810a3544580Soga pg_i->fpgsz = 0; 811a3544580Soga TAILQ_INSERT_TAIL(result, pg_i, pageq); 812a3544580Soga } 813a3544580Soga 814a3544580Soga /* Before handling. */ 815a3544580Soga if (before_sz > 0) { 816a3544580Soga pg->fpgsz = before_sz; 817a3544580Soga uvm_pmr_insert_size(pmr, pg); 818a3544580Soga } 819a3544580Soga 820a3544580Soga /* After handling. */ 821a3544580Soga if (after_sz > 0) { 822a3544580Soga #ifdef DEBUG 823a3544580Soga for (i = 0; i < after_sz; i++) { 824a3544580Soga KASSERT(!uvm_pmr_isfree(after + i)); 825a3544580Soga } 826a3544580Soga #endif 827a3544580Soga KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end); 828a3544580Soga after->fpgsz = after_sz; 829a3544580Soga after = uvm_pmr_insert_addr(pmr, after, 1); 830a3544580Soga uvm_pmr_insert_size(pmr, after); 831a3544580Soga } 832a3544580Soga 833a3544580Soga uvm_pmr_assertvalid(pmr); 83456db1aa6Sariane return (after_sz > 0 ? after : NULL); 835a3544580Soga } 836a3544580Soga 837a3544580Soga /* 838a3544580Soga * Acquire a number of pages. 839a3544580Soga * 840a3544580Soga * count: the number of pages returned 841a3544580Soga * start: lowest page number 842a3544580Soga * end: highest page number +1 843a3544580Soga * (start = end = 0: no limitation) 844a3544580Soga * align: power-of-2 alignment constraint (align = 1: no alignment) 845a3544580Soga * boundary: power-of-2 boundary (boundary = 0: no boundary) 846a3544580Soga * maxseg: maximum number of segments to return 847a3544580Soga * flags: UVM_PLA_* flags 848a3544580Soga * result: returned pages storage (uses pageq) 849a3544580Soga */ 850a3544580Soga int 851a3544580Soga uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align, 852a3544580Soga paddr_t boundary, int maxseg, int flags, struct pglist *result) 853a3544580Soga { 854a3544580Soga struct uvm_pmemrange *pmr; /* Iterate memory ranges. */ 855a3544580Soga struct vm_page *found, *f_next; /* Iterate chunks. */ 856a3544580Soga psize_t fcount; /* Current found pages. */ 857a3544580Soga int fnsegs; /* Current segment counter. 
*/ 858a3544580Soga int try, start_try; 859a3544580Soga psize_t search[3]; 860a3544580Soga paddr_t fstart, fend; /* Pages to be taken from found. */ 861a3544580Soga int memtype; /* Requested memtype. */ 862a3544580Soga int memtype_init; /* Best memtype. */ 863a3544580Soga int desperate; /* True if allocation failed. */ 864a19ffaa9Sariane #ifdef DIAGNOSTIC 865a19ffaa9Sariane struct vm_page *diag_prev; /* Used during validation. */ 866a19ffaa9Sariane #endif /* DIAGNOSTIC */ 867a3544580Soga 868a3544580Soga /* 869a3544580Soga * Validate arguments. 870a3544580Soga */ 8716aa35f1fSbeck KASSERT(count > 0); 8726aa35f1fSbeck KASSERT(start == 0 || end == 0 || start < end); 8736aa35f1fSbeck KASSERT(align >= 1); 8746aa35f1fSbeck KASSERT(powerof2(align)); 8756aa35f1fSbeck KASSERT(maxseg > 0); 8766aa35f1fSbeck KASSERT(boundary == 0 || powerof2(boundary)); 8776aa35f1fSbeck KASSERT(boundary == 0 || maxseg * boundary >= count); 8786aa35f1fSbeck KASSERT(TAILQ_EMPTY(result)); 87903c020efSmpi KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT)); 880a3544580Soga 881a3544580Soga /* 882a3544580Soga * TRYCONTIG is a noop if you only want a single segment. 883a3544580Soga * Remove it if that's the case: otherwise it'll deny the fast 884a3544580Soga * allocation. 885a3544580Soga */ 886a3544580Soga if (maxseg == 1 || count == 1) 887a3544580Soga flags &= ~UVM_PLA_TRYCONTIG; 888a3544580Soga 889a3544580Soga /* 890a3544580Soga * Configure search. 891a3544580Soga * 892a3544580Soga * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case. 893a3544580Soga * search[1] is multiple segments, chosen to fulfill the search in 894a3544580Soga * approximately even-sized segments. 895a3544580Soga * This is a good trade-off between slightly reduced allocation speed 896a3544580Soga * and less fragmentation. 897a3544580Soga * search[2] is the worst case, in which all segments are evaluated. 898a3544580Soga * This provides the least fragmentation, but makes the search 899a3544580Soga * possibly longer (although in the case it is selected, that no 900a3544580Soga * longer matters most). 901a3544580Soga * 902a3544580Soga * The exception is when maxseg == 1: since we can only fulfill that 903a3544580Soga * with one segment of size pages, only a single search type has to 904a3544580Soga * be attempted. 905a3544580Soga */ 906a3544580Soga if (maxseg == 1 || count == 1) { 907a3544580Soga start_try = 2; 908a3544580Soga search[2] = count; 909a3544580Soga } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) { 910a3544580Soga start_try = 2; 911a3544580Soga search[2] = 1; 912a3544580Soga } else { 913a3544580Soga start_try = 0; 914a3544580Soga search[0] = count; 915a3544580Soga search[1] = pow2divide(count, maxseg); 916a3544580Soga search[2] = 1; 917a3544580Soga if ((flags & UVM_PLA_TRYCONTIG) == 0) 918a3544580Soga start_try = 1; 919a3544580Soga if (search[1] >= search[0]) { 920a3544580Soga search[1] = search[0]; 921a3544580Soga start_try = 1; 922a3544580Soga } 923a3544580Soga if (search[2] >= search[start_try]) { 924a3544580Soga start_try = 2; 925a3544580Soga } 926a3544580Soga } 927a3544580Soga 928a3544580Soga /* 929a3544580Soga * Memory type: if zeroed memory is requested, traverse the zero set. 930a3544580Soga * Otherwise, traverse the dirty set. 931a3544580Soga * 932a3544580Soga * The memtype iterator is reinitialized to memtype_init on entrance 933a3544580Soga * of a pmemrange. 
934a3544580Soga */ 935a3544580Soga if (flags & UVM_PLA_ZERO) 936a3544580Soga memtype_init = UVM_PMR_MEMTYPE_ZERO; 937a3544580Soga else 938a3544580Soga memtype_init = UVM_PMR_MEMTYPE_DIRTY; 939a3544580Soga 940a3544580Soga /* 941a3544580Soga * Initially, we're not desperate. 942a3544580Soga * 943a3544580Soga * Note that if we return from a sleep, we are still desperate. 944a3544580Soga * Chances are that memory pressure is still high, so resetting 945a3544580Soga * seems over-optimistic to me. 946a3544580Soga */ 947a3544580Soga desperate = 0; 948a3544580Soga 94990ee2fe0Sbeck uvm_lock_fpageq(); 95017fb2b23Smpi retry: /* Return point after sleeping. */ 95103c020efSmpi /* 95203c020efSmpi * check to see if we need to generate some free pages waking 95303c020efSmpi * the pagedaemon. 95403c020efSmpi */ 95503c020efSmpi if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin || 95603c020efSmpi ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg && 95703c020efSmpi (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) 95803c020efSmpi wakeup(&uvm.pagedaemon); 95903c020efSmpi 96003c020efSmpi /* 96103c020efSmpi * fail if any of these conditions is true: 96203c020efSmpi * [1] there really are no free pages, or 96303c020efSmpi * [2] only kernel "reserved" pages remain and 96403c020efSmpi * the UVM_PLA_USERESERVE flag wasn't used. 96503c020efSmpi * [3] only pagedaemon "reserved" pages remain and 96603c020efSmpi * the requestor isn't the pagedaemon nor the syncer. 96703c020efSmpi */ 968b7757c6eSkettenis if ((uvmexp.free <= (uvmexp.reserve_kernel + count)) && 96903c020efSmpi !(flags & UVM_PLA_USERESERVE)) { 97003c020efSmpi uvm_unlock_fpageq(); 97103c020efSmpi return ENOMEM; 97203c020efSmpi } 97303c020efSmpi 97403c020efSmpi if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) && 975d60475a6Smpi !in_pagedaemon(1)) { 9763976fff1Smpi uvm_unlock_fpageq(); 97703c020efSmpi if (flags & UVM_PLA_WAITOK) { 97803c020efSmpi uvm_wait("uvm_pmr_getpages"); 97917fb2b23Smpi uvm_lock_fpageq(); 98017fb2b23Smpi goto retry; 98103c020efSmpi } 98203c020efSmpi return ENOMEM; 98303c020efSmpi } 98403c020efSmpi 985a3544580Soga fcount = 0; 986a3544580Soga fnsegs = 0; 987a3544580Soga 9883d6ee858Soga retry_desperate: 989a3544580Soga /* 990a3544580Soga * If we just want any page(s), go for the really fast option. 991a3544580Soga */ 992a3544580Soga if (count <= maxseg && align == 1 && boundary == 0 && 993a3544580Soga (flags & UVM_PLA_TRYCONTIG) == 0) { 994a3544580Soga fcount += uvm_pmr_get1page(count - fcount, memtype_init, 995ef440d08Skettenis result, start, end, 0); 996a3544580Soga 997a3544580Soga /* 9984af3577fSjsg * If we found sufficient pages, go to the success exit code. 999a3544580Soga * 1000a3544580Soga * Otherwise, go immediately to fail, since we collected 1001a3544580Soga * all we could anyway. 1002a3544580Soga */ 1003a3544580Soga if (fcount == count) 10043d6ee858Soga goto out; 1005a3544580Soga else 10063d6ee858Soga goto fail; 1007a3544580Soga } 1008a3544580Soga 1009a3544580Soga /* 1010ddc77f9fSoga * The heart of the contig case. 
1011a3544580Soga * 1012a3544580Soga * The code actually looks like this: 1013a3544580Soga * 1014a3544580Soga * foreach (struct pmemrange) { 1015a3544580Soga * foreach (memtype) { 1016a3544580Soga * foreach(try) { 1017a3544580Soga * foreach (free range of memtype in pmemrange, 1018a3544580Soga * starting at search[try]) { 1019a3544580Soga * while (range has space left) 1020a3544580Soga * take from range 1021a3544580Soga * } 1022a3544580Soga * } 1023a3544580Soga * } 1024a3544580Soga * 1025a3544580Soga * if next pmemrange has higher usecount than current: 1026a3544580Soga * enter desperate case (which will drain the pmemranges 1027a3544580Soga * until empty prior to moving to the next one) 1028a3544580Soga * } 1029a3544580Soga * 1030a3544580Soga * When desperate is activated, try always starts at the highest 1031a3544580Soga * value. The memtype loop is using a goto ReScanMemtype. 1032a3544580Soga * The try loop is using a goto ReScan. 1033a3544580Soga * The 'range has space left' loop uses label DrainFound. 1034a3544580Soga * 1035a3544580Soga * Writing them all as loops would take up a lot of screen space in 1036a3544580Soga * the form of indentation and some parts are easier to express 1037a3544580Soga * using the labels. 1038a3544580Soga */ 1039a3544580Soga 1040a3544580Soga TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1041a3544580Soga /* Empty range. */ 1042a3544580Soga if (pmr->nsegs == 0) 1043a3544580Soga continue; 1044a3544580Soga 1045a3544580Soga /* Outside requested range. */ 1046a3544580Soga if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 1047a3544580Soga continue; 1048a3544580Soga 1049a3544580Soga memtype = memtype_init; 1050a3544580Soga 10513d6ee858Soga rescan_memtype: /* Return point at memtype++. */ 1052a3544580Soga try = start_try; 1053a3544580Soga 10543d6ee858Soga rescan: /* Return point at try++. */ 1055a3544580Soga for (found = uvm_pmr_nfindsz(pmr, search[try], memtype); 1056a3544580Soga found != NULL; 1057a3544580Soga found = f_next) { 1058a3544580Soga f_next = uvm_pmr_nextsz(pmr, found, memtype); 1059a3544580Soga 1060a3544580Soga fstart = atop(VM_PAGE_TO_PHYS(found)); 1061a3544580Soga if (start != 0) 1062a3544580Soga fstart = MAX(start, fstart); 10633d6ee858Soga drain_found: 1064a3544580Soga /* 1065a3544580Soga * Throw away the first segment if fnsegs == maxseg 1066a3544580Soga * 1067a3544580Soga * Note that f_next is still valid after this call, 1068a3544580Soga * since we only allocated from entries before f_next. 1069a3544580Soga * We don't revisit the entries we already extracted 1070a3544580Soga * from unless we entered the desperate case. 
1071a3544580Soga */ 1072a3544580Soga if (fnsegs == maxseg) { 1073a3544580Soga fnsegs--; 1074a3544580Soga fcount -= 1075a3544580Soga uvm_pmr_remove_1strange(result, boundary, 1076a3544580Soga &found, desperate); 1077a3544580Soga } 1078a3544580Soga 1079a3544580Soga fstart = PMR_ALIGN(fstart, align); 1080a3544580Soga fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz; 1081c6b41277Sariane if (end != 0) 1082c6b41277Sariane fend = MIN(end, fend); 1083a3544580Soga if (boundary != 0) { 1084a3544580Soga fend = 1085a3544580Soga MIN(fend, PMR_ALIGN(fstart + 1, boundary)); 1086a3544580Soga } 1087c6b41277Sariane if (fstart >= fend) 1088c6b41277Sariane continue; 1089a3544580Soga if (fend - fstart > count - fcount) 1090a3544580Soga fend = fstart + (count - fcount); 1091a3544580Soga 1092a3544580Soga fcount += fend - fstart; 1093a3544580Soga fnsegs++; 1094a3544580Soga found = uvm_pmr_extract_range(pmr, found, 1095a3544580Soga fstart, fend, result); 1096a3544580Soga 1097a3544580Soga if (fcount == count) 10983d6ee858Soga goto out; 1099a3544580Soga 1100a3544580Soga /* 1101a3544580Soga * If there's still space left in found, try to 11024af3577fSjsg * fully drain it prior to continuing. 1103a3544580Soga */ 1104a3544580Soga if (found != NULL) { 1105a3544580Soga fstart = fend; 11063d6ee858Soga goto drain_found; 1107a3544580Soga } 1108a3544580Soga } 1109a3544580Soga 111035164244Stedu /* Try a smaller search now. */ 1111a3544580Soga if (++try < nitems(search)) 11123d6ee858Soga goto rescan; 1113a3544580Soga 1114a3544580Soga /* 1115a3544580Soga * Exhaust all memory types prior to going to the next memory 1116a3544580Soga * segment. 1117a3544580Soga * This means that zero-vs-dirty are eaten prior to moving 1118a3544580Soga * to a pmemrange with a higher use-count. 1119a3544580Soga * 1120a3544580Soga * Code is basically a difficult way of writing: 1121a3544580Soga * memtype = memtype_init; 1122a3544580Soga * do { 1123a3544580Soga * ...; 1124a3544580Soga * memtype += 1; 1125a3544580Soga * memtype %= MEMTYPE_MAX; 1126a3544580Soga * } while (memtype != memtype_init); 1127a3544580Soga */ 1128a3544580Soga memtype += 1; 1129a3544580Soga if (memtype == UVM_PMR_MEMTYPE_MAX) 1130a3544580Soga memtype = 0; 1131a3544580Soga if (memtype != memtype_init) 11323d6ee858Soga goto rescan_memtype; 1133a3544580Soga 1134a3544580Soga /* 1135a3544580Soga * If not desperate, enter desperate case prior to eating all 1136a3544580Soga * the good stuff in the next range. 1137a3544580Soga */ 1138a3544580Soga if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL && 1139a3544580Soga TAILQ_NEXT(pmr, pmr_use)->use != pmr->use) 1140a3544580Soga break; 1141a3544580Soga } 1142a3544580Soga 1143a3544580Soga /* 1144a3544580Soga * Not enough memory of the requested type available. Fall back to 1145a3544580Soga * less good memory that we'll clean up better later. 1146a3544580Soga * 1147a3544580Soga * This algorithm is not very smart though, it just starts scanning 1148a3544580Soga * a different typed range, but the nicer ranges of the previous 1149a3544580Soga * iteration may fall out. Hence there is a small chance of a false 1150a3544580Soga * negative. 1151a3544580Soga * 11524af3577fSjsg * When desperate: scan all sizes starting at the smallest 1153a3544580Soga * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may 1154a3544580Soga * allow us to hit the fast path now). 
1155a3544580Soga * 1156a3544580Soga * Also, because we will revisit entries we scanned before, we need 1157a3544580Soga * to reset the page queue, or we may end up releasing entries in 1158a3544580Soga * such a way as to invalidate f_next. 1159a3544580Soga */ 1160a3544580Soga if (!desperate) { 1161a3544580Soga desperate = 1; 1162a3544580Soga start_try = nitems(search) - 1; 1163a3544580Soga flags &= ~UVM_PLA_TRYCONTIG; 1164a3544580Soga 1165a3544580Soga while (!TAILQ_EMPTY(result)) 1166a3544580Soga uvm_pmr_remove_1strange(result, 0, NULL, 0); 1167a3544580Soga fnsegs = 0; 1168a3544580Soga fcount = 0; 11693d6ee858Soga goto retry_desperate; 1170a3544580Soga } 1171a3544580Soga 11723d6ee858Soga fail: 117335164244Stedu /* Allocation failed. */ 1174a3544580Soga /* XXX: claim from memory reserve here */ 1175a3544580Soga 1176a3544580Soga while (!TAILQ_EMPTY(result)) 1177a3544580Soga uvm_pmr_remove_1strange(result, 0, NULL, 0); 1178a3544580Soga 1179a3544580Soga if (flags & UVM_PLA_WAITOK) { 118090ee2fe0Sbeck if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count), 118190ee2fe0Sbeck flags & UVM_PLA_FAILOK) == 0) 11823d6ee858Soga goto retry; 118390ee2fe0Sbeck KASSERT(flags & UVM_PLA_FAILOK); 1184ba03bb80Smpi } else if (!(flags & UVM_PLA_NOWAKE)) { 1185ba03bb80Smpi struct uvm_pmalloc *pma = &nowait_pma; 1186ba03bb80Smpi 1187ba03bb80Smpi if (!(nowait_pma.pm_flags & UVM_PMA_LINKED)) { 1188ba03bb80Smpi nowait_pma.pm_flags = UVM_PMA_LINKED; 1189ba03bb80Smpi TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, pma, pmq); 11908f004b5aSthib wakeup(&uvm.pagedaemon); 1191d4c6c9b5Sbeck } 1192d4c6c9b5Sbeck } 119390ee2fe0Sbeck uvm_unlock_fpageq(); 1194a3544580Soga 1195a3544580Soga return ENOMEM; 1196a3544580Soga 11973d6ee858Soga out: 11984af3577fSjsg /* Allocation successful. */ 1199a3544580Soga uvmexp.free -= fcount; 1200a3544580Soga 1201a3544580Soga uvm_unlock_fpageq(); 1202a3544580Soga 1203a3544580Soga /* Update statistics and zero pages if UVM_PLA_ZERO. */ 1204a19ffaa9Sariane #ifdef DIAGNOSTIC 1205a19ffaa9Sariane fnsegs = 0; 1206a19ffaa9Sariane fcount = 0; 1207a19ffaa9Sariane diag_prev = NULL; 1208a19ffaa9Sariane #endif /* DIAGNOSTIC */ 1209a3544580Soga TAILQ_FOREACH(found, result, pageq) { 12101a0bb2c4Smiod atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK); 1211a3544580Soga 1212a3544580Soga if (found->pg_flags & PG_ZERO) { 1213e216ee02Sblambert uvm_lock_fpageq(); 1214a3544580Soga uvmexp.zeropages--; 1215ef440d08Skettenis if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1216ef440d08Skettenis wakeup(&uvmexp.zeropages); 1217e216ee02Sblambert uvm_unlock_fpageq(); 1218a3544580Soga } 1219a3544580Soga if (flags & UVM_PLA_ZERO) { 1220a3544580Soga if (found->pg_flags & PG_ZERO) 1221a3544580Soga uvmexp.pga_zerohit++; 1222a3544580Soga else { 1223a3544580Soga uvmexp.pga_zeromiss++; 1224a3544580Soga uvm_pagezero(found); 1225a3544580Soga } 1226a3544580Soga } 1227a3544580Soga atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE); 1228a3544580Soga 1229a3544580Soga found->uobject = NULL; 1230a3544580Soga found->uanon = NULL; 1231a3544580Soga found->pg_version++; 1232a3544580Soga 1233a3544580Soga /* 1234a3544580Soga * Validate that the page matches range criterium. 1235a3544580Soga */ 1236a3544580Soga KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start); 1237a3544580Soga KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end); 1238a19ffaa9Sariane 1239a19ffaa9Sariane #ifdef DIAGNOSTIC 1240a19ffaa9Sariane /* 1241a19ffaa9Sariane * Update fcount (# found pages) and 1242a19ffaa9Sariane * fnsegs (# found segments) counters. 
1243a19ffaa9Sariane */ 1244a19ffaa9Sariane if (diag_prev == NULL || 1245a19ffaa9Sariane /* new segment if it contains a hole */ 1246a19ffaa9Sariane atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 != 1247a19ffaa9Sariane atop(VM_PAGE_TO_PHYS(found)) || 1248a19ffaa9Sariane /* new segment if it crosses boundary */ 1249a19ffaa9Sariane (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) != 1250a19ffaa9Sariane (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1))) 1251a19ffaa9Sariane fnsegs++; 1252a19ffaa9Sariane fcount++; 1253a19ffaa9Sariane 1254a19ffaa9Sariane diag_prev = found; 1255a19ffaa9Sariane #endif /* DIAGNOSTIC */ 1256a3544580Soga } 1257a3544580Soga 1258a19ffaa9Sariane #ifdef DIAGNOSTIC 1259a19ffaa9Sariane /* 1260a19ffaa9Sariane * Panic on algorithm failure. 1261a19ffaa9Sariane */ 1262a19ffaa9Sariane if (fcount != count || fnsegs > maxseg) { 1263a19ffaa9Sariane panic("pmemrange allocation error: " 1264a19ffaa9Sariane "allocated %ld pages in %d segments, " 1265a19ffaa9Sariane "but request was %ld pages in %d segments", 1266a19ffaa9Sariane fcount, fnsegs, count, maxseg); 1267a19ffaa9Sariane } 1268a19ffaa9Sariane #endif /* DIAGNOSTIC */ 1269a19ffaa9Sariane 1270a3544580Soga return 0; 1271a3544580Soga } 1272a3544580Soga 1273a3544580Soga /* 127482673a18Smpi * Acquire a single page. 127582673a18Smpi * 127682673a18Smpi * flags: UVM_PLA_* flags 127782673a18Smpi * result: returned page. 127882673a18Smpi */ 127982673a18Smpi struct vm_page * 128082673a18Smpi uvm_pmr_getone(int flags) 128182673a18Smpi { 128282673a18Smpi struct vm_page *pg; 128382673a18Smpi struct pglist pgl; 128482673a18Smpi 128582673a18Smpi TAILQ_INIT(&pgl); 128682673a18Smpi if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, flags, &pgl) != 0) 128782673a18Smpi return NULL; 128882673a18Smpi 128982673a18Smpi pg = TAILQ_FIRST(&pgl); 129082673a18Smpi KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL); 129182673a18Smpi 129282673a18Smpi return pg; 129382673a18Smpi } 129482673a18Smpi 129582673a18Smpi /* 1296a3544580Soga * Free a number of contig pages (invoked by uvm_page_init). 
1297a3544580Soga */ 1298a3544580Soga void 1299a3544580Soga uvm_pmr_freepages(struct vm_page *pg, psize_t count) 1300a3544580Soga { 1301a3544580Soga struct uvm_pmemrange *pmr; 1302a3544580Soga psize_t i, pmr_count; 130332007cedSmiod struct vm_page *firstpg = pg; 1304a3544580Soga 1305a3544580Soga for (i = 0; i < count; i++) { 1306a3544580Soga KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) == 1307a3544580Soga atop(VM_PAGE_TO_PHYS(pg)) + i); 1308a3544580Soga 1309a3544580Soga if (!((pg[i].pg_flags & PQ_FREE) == 0 && 1310a3544580Soga VALID_FLAGS(pg[i].pg_flags))) { 1311a3544580Soga printf("Flags: 0x%x, will panic now.\n", 1312a3544580Soga pg[i].pg_flags); 1313a3544580Soga } 1314a3544580Soga KASSERT((pg[i].pg_flags & PQ_FREE) == 0 && 1315a3544580Soga VALID_FLAGS(pg[i].pg_flags)); 1316a3544580Soga atomic_setbits_int(&pg[i].pg_flags, PQ_FREE); 1317a3544580Soga atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO); 1318a3544580Soga } 1319a3544580Soga 1320a3544580Soga uvm_lock_fpageq(); 1321a3544580Soga 132232007cedSmiod for (i = count; i > 0; i -= pmr_count) { 1323a3544580Soga pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1324a3544580Soga KASSERT(pmr != NULL); 1325a3544580Soga 132632007cedSmiod pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg))); 1327a3544580Soga pg->fpgsz = pmr_count; 1328a3544580Soga uvm_pmr_insert(pmr, pg, 0); 1329a3544580Soga 1330a3544580Soga uvmexp.free += pmr_count; 1331a3544580Soga pg += pmr_count; 1332a3544580Soga } 1333a3544580Soga wakeup(&uvmexp.free); 1334ef440d08Skettenis if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1335ef440d08Skettenis wakeup(&uvmexp.zeropages); 1336a3544580Soga 133732007cedSmiod uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count)); 133890ee2fe0Sbeck 1339a3544580Soga uvm_unlock_fpageq(); 1340a3544580Soga } 1341a3544580Soga 1342a3544580Soga /* 1343a3544580Soga * Free all pages in the queue. 1344a3544580Soga */ 1345a3544580Soga void 1346a3544580Soga uvm_pmr_freepageq(struct pglist *pgl) 1347a3544580Soga { 1348a3544580Soga struct vm_page *pg; 134990ee2fe0Sbeck paddr_t pstart; 135090ee2fe0Sbeck psize_t plen; 1351a3544580Soga 1352a3544580Soga TAILQ_FOREACH(pg, pgl, pageq) { 1353a3544580Soga if (!((pg->pg_flags & PQ_FREE) == 0 && 1354a3544580Soga VALID_FLAGS(pg->pg_flags))) { 1355a3544580Soga printf("Flags: 0x%x, will panic now.\n", 1356a3544580Soga pg->pg_flags); 1357a3544580Soga } 1358a3544580Soga KASSERT((pg->pg_flags & PQ_FREE) == 0 && 1359a3544580Soga VALID_FLAGS(pg->pg_flags)); 1360a3544580Soga atomic_setbits_int(&pg->pg_flags, PQ_FREE); 1361a3544580Soga atomic_clearbits_int(&pg->pg_flags, PG_ZERO); 1362a3544580Soga } 1363a3544580Soga 1364a3544580Soga uvm_lock_fpageq(); 136590ee2fe0Sbeck while (!TAILQ_EMPTY(pgl)) { 13661e57804eSbeck pg = TAILQ_FIRST(pgl); 13671e57804eSbeck if (pg == TAILQ_NEXT(pg, pageq) + 1) { 13681e57804eSbeck /* 13691e57804eSbeck * If pg is one behind the position of the 13701e57804eSbeck * next page in the list in the page array, 13711e57804eSbeck * try going backwards instead of forward. 
13721e57804eSbeck */ 13731e57804eSbeck plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart); 13741e57804eSbeck } else { 137590ee2fe0Sbeck pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl)); 137690ee2fe0Sbeck plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0); 13771e57804eSbeck } 137890ee2fe0Sbeck uvmexp.free += plen; 137990ee2fe0Sbeck 138090ee2fe0Sbeck uvm_wakeup_pla(pstart, ptoa(plen)); 138190ee2fe0Sbeck } 1382a3544580Soga wakeup(&uvmexp.free); 1383ef440d08Skettenis if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1384ef440d08Skettenis wakeup(&uvmexp.zeropages); 1385a3544580Soga uvm_unlock_fpageq(); 1386a3544580Soga 1387a3544580Soga return; 1388a3544580Soga } 1389a3544580Soga 1390a3544580Soga /* 1391a3544580Soga * Store a pmemrange in the list. 1392a3544580Soga * 1393a3544580Soga * The list is sorted by use. 1394a3544580Soga */ 1395a3544580Soga struct uvm_pmemrange * 1396a3544580Soga uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq, 1397a3544580Soga struct uvm_pmemrange *pmr) 1398a3544580Soga { 1399a3544580Soga struct uvm_pmemrange *iter; 1400a3544580Soga int cmp = 1; 1401a3544580Soga 1402a3544580Soga TAILQ_FOREACH(iter, useq, pmr_use) { 1403a3544580Soga cmp = uvm_pmemrange_use_cmp(pmr, iter); 1404a3544580Soga if (cmp == 0) 1405a3544580Soga return iter; 1406a3544580Soga if (cmp == -1) 1407a3544580Soga break; 1408a3544580Soga } 1409a3544580Soga 1410a3544580Soga if (iter == NULL) 1411a3544580Soga TAILQ_INSERT_TAIL(useq, pmr, pmr_use); 1412a3544580Soga else 1413a3544580Soga TAILQ_INSERT_BEFORE(iter, pmr, pmr_use); 1414a3544580Soga return NULL; 1415a3544580Soga } 1416a3544580Soga 1417a3544580Soga #ifdef DEBUG 1418a3544580Soga /* 1419a3544580Soga * Validation of the whole pmemrange. 1420a3544580Soga * Called with fpageq locked. 1421a3544580Soga */ 1422a3544580Soga void 1423a3544580Soga uvm_pmr_assertvalid(struct uvm_pmemrange *pmr) 1424a3544580Soga { 1425a3544580Soga struct vm_page *prev, *next, *i, *xref; 1426a3544580Soga int lcv, mti; 1427a3544580Soga 1428b0fb607fSthib /* Empty range */ 1429b0fb607fSthib if (pmr->nsegs == 0) 1430b0fb607fSthib return; 1431b0fb607fSthib 1432a3544580Soga /* Validate address tree. */ 14336d0d0baaSdlg RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) { 1434a3544580Soga /* Validate the range. */ 1435a3544580Soga KASSERT(i->fpgsz > 0); 1436a3544580Soga KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low); 1437a3544580Soga KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz 1438a3544580Soga <= pmr->high); 1439a3544580Soga 1440a3544580Soga /* Validate each page in this range. */ 1441a3544580Soga for (lcv = 0; lcv < i->fpgsz; lcv++) { 1442a3544580Soga /* 1443a3544580Soga * Only the first page has a size specification. 1444a3544580Soga * Rest is size 0. 1445a3544580Soga */ 1446a3544580Soga KASSERT(lcv == 0 || i[lcv].fpgsz == 0); 1447a3544580Soga /* 1448a3544580Soga * Flag check. 1449a3544580Soga */ 1450a3544580Soga KASSERT(VALID_FLAGS(i[lcv].pg_flags) && 1451a3544580Soga (i[lcv].pg_flags & PQ_FREE) == PQ_FREE); 1452a3544580Soga /* 1453a3544580Soga * Free pages are: 1454a3544580Soga * - not wired 1455a3544580Soga * - have no vm_anon 1456a3544580Soga * - have no uvm_object 1457a3544580Soga */ 1458a3544580Soga KASSERT(i[lcv].wire_count == 0); 1459a3544580Soga KASSERT(i[lcv].uanon == (void*)0xdeadbeef || 1460a3544580Soga i[lcv].uanon == NULL); 1461a3544580Soga KASSERT(i[lcv].uobject == (void*)0xdeadbeef || 1462a3544580Soga i[lcv].uobject == NULL); 1463a3544580Soga /* 1464a3544580Soga * Pages in a single range always have the same 1465a3544580Soga * memtype. 
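 * (uvm_pmr_pg_to_memtype() must return the same value for every
 * page of the range, i.e. PG_ZERO is set either on all of them
 * or on none of them.)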
1466a3544580Soga */ 1467a3544580Soga KASSERT(uvm_pmr_pg_to_memtype(&i[0]) == 1468a3544580Soga uvm_pmr_pg_to_memtype(&i[lcv])); 1469a3544580Soga } 1470a3544580Soga 1471a3544580Soga /* Check that it shouldn't be joined with its predecessor. */ 14726d0d0baaSdlg prev = RBT_PREV(uvm_pmr_addr, i); 1473a3544580Soga if (prev != NULL) { 1474a3544580Soga KASSERT(uvm_pmr_pg_to_memtype(i) != 1475a3544580Soga uvm_pmr_pg_to_memtype(prev) || 1476a3544580Soga atop(VM_PAGE_TO_PHYS(i)) > 1477a3544580Soga atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz || 1478a3544580Soga prev + prev->fpgsz != i); 1479a3544580Soga } 1480a3544580Soga 1481a3544580Soga /* Assert i is in the size tree as well. */ 1482a3544580Soga if (i->fpgsz == 1) { 1483a3544580Soga TAILQ_FOREACH(xref, 1484a3544580Soga &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) { 1485a3544580Soga if (xref == i) 1486a3544580Soga break; 1487a3544580Soga } 1488a3544580Soga KASSERT(xref == i); 1489a3544580Soga } else { 14906d0d0baaSdlg KASSERT(RBT_FIND(uvm_pmr_size, 1491a3544580Soga &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) == 1492a3544580Soga i + 1); 1493a3544580Soga } 1494a3544580Soga } 1495a3544580Soga 1496a3544580Soga /* Validate size tree. */ 1497a3544580Soga for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 1498a3544580Soga for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) { 1499a3544580Soga next = uvm_pmr_nextsz(pmr, i, mti); 1500a3544580Soga if (next != NULL) { 1501a3544580Soga KASSERT(i->fpgsz <= 1502a3544580Soga next->fpgsz); 1503a3544580Soga } 1504a3544580Soga 1505a3544580Soga /* Assert i is in the addr tree as well. */ 15066d0d0baaSdlg KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i); 1507a3544580Soga 1508a3544580Soga /* Assert i is of the correct memory type. */ 1509a3544580Soga KASSERT(uvm_pmr_pg_to_memtype(i) == mti); 1510a3544580Soga } 1511a3544580Soga } 1512a3544580Soga 1513a3544580Soga /* Validate nsegs statistic. */ 1514a3544580Soga lcv = 0; 15156d0d0baaSdlg RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) 1516a3544580Soga lcv++; 1517a3544580Soga KASSERT(pmr->nsegs == lcv); 1518a3544580Soga } 1519a3544580Soga #endif /* DEBUG */ 1520a3544580Soga 1521a3544580Soga /* 1522a3544580Soga * Split pmr at split point pageno. 1523a3544580Soga * Called with fpageq unlocked. 1524a3544580Soga * 1525a3544580Soga * Split is only applied if a pmemrange spans pageno. 1526a3544580Soga */ 1527a3544580Soga void 1528a3544580Soga uvm_pmr_split(paddr_t pageno) 1529a3544580Soga { 1530a3544580Soga struct uvm_pmemrange *pmr, *drain; 1531a3544580Soga struct vm_page *rebuild, *prev, *next; 1532a3544580Soga psize_t prev_sz; 1533a3544580Soga 1534a3544580Soga uvm_lock_fpageq(); 1535a3544580Soga pmr = uvm_pmemrange_find(pageno); 1536a3544580Soga if (pmr == NULL || !(pmr->low < pageno)) { 1537a3544580Soga /* No split required. */ 1538a3544580Soga uvm_unlock_fpageq(); 1539a3544580Soga return; 1540a3544580Soga } 1541a3544580Soga 1542a3544580Soga KASSERT(pmr->low < pageno); 1543a3544580Soga KASSERT(pmr->high > pageno); 1544a3544580Soga 1545b426ab7bSthib /* 1546b426ab7bSthib * uvm_pmr_allocpmr() calls into malloc() which in turn calls into 1547b426ab7bSthib * uvm_kmemalloc which calls into pmemrange, making the locking 1548b426ab7bSthib * a bit hard, so we just race! 
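 * The pattern below is: drop the lock, allocate, retake the lock
 * and re-check whether the split is still needed; if someone else
 * already performed it (or made it unnecessary) in the meantime,
 * give up and leak the freshly allocated pmr rather than trying
 * to undo the allocation.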
1549b426ab7bSthib */ 1550b426ab7bSthib uvm_unlock_fpageq(); 1551a3544580Soga drain = uvm_pmr_allocpmr(); 1552b426ab7bSthib uvm_lock_fpageq(); 1553b426ab7bSthib pmr = uvm_pmemrange_find(pageno); 1554b426ab7bSthib if (pmr == NULL || !(pmr->low < pageno)) { 1555b426ab7bSthib /* 1556b426ab7bSthib * We lost the race since someone else ran this or a related 1557b426ab7bSthib * function, however this should be triggered very rarely so 1558b426ab7bSthib * we just leak the pmr. 1559b426ab7bSthib */ 1560b426ab7bSthib printf("uvm_pmr_split: lost one pmr\n"); 1561b426ab7bSthib uvm_unlock_fpageq(); 1562b426ab7bSthib return; 1563b426ab7bSthib } 1564b426ab7bSthib 1565a3544580Soga drain->low = pageno; 1566a3544580Soga drain->high = pmr->high; 1567a3544580Soga drain->use = pmr->use; 1568a3544580Soga 1569a3544580Soga uvm_pmr_assertvalid(pmr); 1570a3544580Soga uvm_pmr_assertvalid(drain); 1571a3544580Soga KASSERT(drain->nsegs == 0); 1572a3544580Soga 1573262a556aSdlg RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) { 1574a3544580Soga if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno) 1575a3544580Soga break; 1576a3544580Soga } 1577a3544580Soga if (rebuild == NULL) 1578262a556aSdlg prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 1579a3544580Soga else 1580262a556aSdlg prev = RBT_PREV(uvm_pmr_addr, rebuild); 1581a3544580Soga KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1582a3544580Soga 1583a3544580Soga /* 1584a3544580Soga * Handle free chunk that spans the split point. 1585a3544580Soga */ 1586a3544580Soga if (prev != NULL && 1587a3544580Soga atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) { 1588a3544580Soga psize_t before, after; 1589a3544580Soga 1590a3544580Soga KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1591a3544580Soga 1592a3544580Soga uvm_pmr_remove(pmr, prev); 1593a3544580Soga prev_sz = prev->fpgsz; 1594a3544580Soga before = pageno - atop(VM_PAGE_TO_PHYS(prev)); 1595a3544580Soga after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno; 1596a3544580Soga 1597a3544580Soga KASSERT(before > 0); 1598a3544580Soga KASSERT(after > 0); 1599a3544580Soga 1600a3544580Soga prev->fpgsz = before; 1601a3544580Soga uvm_pmr_insert(pmr, prev, 1); 1602a3544580Soga (prev + before)->fpgsz = after; 1603a3544580Soga uvm_pmr_insert(drain, prev + before, 1); 1604a3544580Soga } 1605a3544580Soga 160635164244Stedu /* Move free chunks that no longer fall in the range. */ 1607a3544580Soga for (; rebuild != NULL; rebuild = next) { 1608262a556aSdlg next = RBT_NEXT(uvm_pmr_addr, rebuild); 1609a3544580Soga 1610a3544580Soga uvm_pmr_remove(pmr, rebuild); 1611a3544580Soga uvm_pmr_insert(drain, rebuild, 1); 1612a3544580Soga } 1613a3544580Soga 1614a3544580Soga pmr->high = pageno; 1615a3544580Soga uvm_pmr_assertvalid(pmr); 1616a3544580Soga uvm_pmr_assertvalid(drain); 1617a3544580Soga 16183accc929Sdlg RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain); 1619a3544580Soga uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain); 1620a3544580Soga uvm_unlock_fpageq(); 1621a3544580Soga } 1622a3544580Soga 1623a3544580Soga /* 1624a3544580Soga * Increase the usage counter for the given range of memory. 1625a3544580Soga * 1626a3544580Soga * The more usage counters a given range of memory has, the more will be 1627a3544580Soga * attempted not to allocate from it. 1628a3544580Soga * 1629a3544580Soga * Addresses here are in paddr_t, not page-numbers. 1630a3544580Soga * The lowest and highest allowed address are specified. 
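 *
 * For example, a machine-dependent DMA constraint covering the
 * first 16MB of physical memory (the exact bounds are
 * illustrative, not taken from this file) would be registered as:
 *
 *	uvm_pmr_use_inc(0, 0x00ffffff);
 *
 * after which unconstrained allocations prefer the memory above
 * that range.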
1631a3544580Soga */ 1632a3544580Soga void 1633a3544580Soga uvm_pmr_use_inc(paddr_t low, paddr_t high) 1634a3544580Soga { 1635a3544580Soga struct uvm_pmemrange *pmr; 1636b426ab7bSthib paddr_t sz; 1637a3544580Soga 1638b426ab7bSthib /* pmr uses page numbers, translate low and high. */ 1639a3544580Soga high++; 1640a3544580Soga high = atop(trunc_page(high)); 1641b426ab7bSthib low = atop(round_page(low)); 1642a3544580Soga uvm_pmr_split(low); 1643a3544580Soga uvm_pmr_split(high); 1644a3544580Soga 1645c37416d6Smiod sz = 0; 1646a3544580Soga uvm_lock_fpageq(); 1647a3544580Soga /* Increase use count on segments in range. */ 16483accc929Sdlg RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) { 1649a3544580Soga if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) { 1650a3544580Soga TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use); 1651a3544580Soga pmr->use++; 1652b426ab7bSthib sz += pmr->high - pmr->low; 1653a3544580Soga uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr); 1654a3544580Soga } 1655a3544580Soga uvm_pmr_assertvalid(pmr); 1656a3544580Soga } 1657a3544580Soga uvm_unlock_fpageq(); 1658b426ab7bSthib 1659b426ab7bSthib KASSERT(sz >= high - low); 1660a3544580Soga } 1661a3544580Soga 1662a3544580Soga /* 1663a3544580Soga * Allocate a pmemrange. 1664a3544580Soga * 1665a3544580Soga * If called from uvm_page_init, the uvm_pageboot_alloc is used. 1666a3544580Soga * If called after uvm_init, malloc is used. 1667a3544580Soga * (And if called in between, you're dead.) 1668a3544580Soga */ 1669a3544580Soga struct uvm_pmemrange * 1670b426ab7bSthib uvm_pmr_allocpmr(void) 1671a3544580Soga { 1672a3544580Soga struct uvm_pmemrange *nw; 1673a3544580Soga int i; 1674a3544580Soga 1675b426ab7bSthib /* We're only ever hitting the !uvm.page_init_done case for now. */ 1676a3544580Soga if (!uvm.page_init_done) { 1677a3544580Soga nw = (struct uvm_pmemrange *) 1678a3544580Soga uvm_pageboot_alloc(sizeof(struct uvm_pmemrange)); 1679a3544580Soga } else { 1680a3544580Soga nw = malloc(sizeof(struct uvm_pmemrange), 1681b426ab7bSthib M_VMMAP, M_NOWAIT); 1682a3544580Soga } 1683b426ab7bSthib KASSERT(nw != NULL); 16846c0aa6dcStedu memset(nw, 0, sizeof(struct uvm_pmemrange)); 1685262a556aSdlg RBT_INIT(uvm_pmr_addr, &nw->addr); 1686a3544580Soga for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) { 1687262a556aSdlg RBT_INIT(uvm_pmr_size, &nw->size[i]); 1688a3544580Soga TAILQ_INIT(&nw->single[i]); 1689a3544580Soga } 1690a3544580Soga return nw; 1691a3544580Soga } 1692a3544580Soga 1693a3544580Soga /* 1694a3544580Soga * Initialization of pmr. 1695a3544580Soga * Called by uvm_page_init. 1696a3544580Soga * 1697a3544580Soga * Sets up pmemranges. 1698a3544580Soga */ 1699a3544580Soga void 1700a3544580Soga uvm_pmr_init(void) 1701a3544580Soga { 1702a3544580Soga struct uvm_pmemrange *new_pmr; 1703a3544580Soga int i; 1704a3544580Soga 1705a3544580Soga TAILQ_INIT(&uvm.pmr_control.use); 17063accc929Sdlg RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 170790ee2fe0Sbeck TAILQ_INIT(&uvm.pmr_control.allocs); 1708a3544580Soga 1709b426ab7bSthib /* By default, one range for the entire address space. 
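 * (covering page numbers 0 up to atop((paddr_t)-1) + 1); the loop
 * below then raises the use count of every MD constraint range,
 * which splits this initial range along the constraint boundaries.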
*/ 1710a3544580Soga new_pmr = uvm_pmr_allocpmr(); 1711a3544580Soga new_pmr->low = 0; 1712a3544580Soga new_pmr->high = atop((paddr_t)-1) + 1; 1713a3544580Soga 17143accc929Sdlg RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr); 1715a3544580Soga uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr); 1716a3544580Soga 1717b426ab7bSthib for (i = 0; uvm_md_constraints[i] != NULL; i++) { 1718b426ab7bSthib uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low, 1719b426ab7bSthib uvm_md_constraints[i]->ucr_high); 1720b426ab7bSthib } 1721a3544580Soga } 1722a3544580Soga 1723a3544580Soga /* 1724a3544580Soga * Find the pmemrange that contains the given page number. 1725a3544580Soga * 1726a3544580Soga * (Manually traverses the binary tree, because that is cheaper on stack 1727a3544580Soga * usage.) 1728a3544580Soga */ 1729a3544580Soga struct uvm_pmemrange * 1730a3544580Soga uvm_pmemrange_find(paddr_t pageno) 1731a3544580Soga { 1732a3544580Soga struct uvm_pmemrange *pmr; 1733a3544580Soga 17343accc929Sdlg pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 1735a3544580Soga while (pmr != NULL) { 1736a3544580Soga if (pmr->low > pageno) 17373accc929Sdlg pmr = RBT_LEFT(uvm_pmemrange_addr, pmr); 1738a3544580Soga else if (pmr->high <= pageno) 17393accc929Sdlg pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr); 1740a3544580Soga else 1741a3544580Soga break; 1742a3544580Soga } 1743a3544580Soga 1744a3544580Soga return pmr; 1745a3544580Soga } 1746a3544580Soga 1747a3544580Soga #if defined(DDB) || defined(DEBUG) 1748a3544580Soga /* 1749a3544580Soga * Return true if the given page is in any of the free lists. 1750a3544580Soga * Used by uvm_page_printit. 1751a3544580Soga * This function is safe, even if the page is not on the freeq. 1752a3544580Soga * Note: does not apply locking, only called from ddb. 1753a3544580Soga */ 1754a3544580Soga int 1755a3544580Soga uvm_pmr_isfree(struct vm_page *pg) 1756a3544580Soga { 1757a3544580Soga struct vm_page *r; 1758a3544580Soga struct uvm_pmemrange *pmr; 1759a3544580Soga 1760a3544580Soga pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1761a3544580Soga if (pmr == NULL) 1762a3544580Soga return 0; 1763262a556aSdlg r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 1764a3544580Soga if (r == NULL) 1765262a556aSdlg r = RBT_MAX(uvm_pmr_addr, &pmr->addr); 176606c0d188Svisa else if (r != pg) 1767262a556aSdlg r = RBT_PREV(uvm_pmr_addr, r); 1768a3544580Soga if (r == NULL) 1769a3544580Soga return 0; /* Empty tree. */ 1770a3544580Soga 1771a3544580Soga KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg))); 1772a3544580Soga return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz > 1773a3544580Soga atop(VM_PAGE_TO_PHYS(pg)); 1774a3544580Soga } 1775a3544580Soga #endif /* DEBUG */ 1776a3544580Soga 1777a3544580Soga /* 1778a3544580Soga * Given a root of a tree, find a range which intersects start, end and 1779a3544580Soga * is of the same memtype. 1780a3544580Soga * 1781a3544580Soga * Page must be in the address tree. 1782a3544580Soga */ 1783a3544580Soga struct vm_page* 1784a3544580Soga uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root, 1785a3544580Soga paddr_t start, paddr_t end, int memtype) 1786a3544580Soga { 1787a3544580Soga int direction; 1788a3544580Soga struct vm_page *root; 1789a3544580Soga struct vm_page *high, *high_next; 1790a3544580Soga struct vm_page *low, *low_next; 1791a3544580Soga 1792a3544580Soga KDASSERT(pmr != NULL && init_root != NULL); 1793a3544580Soga root = init_root; 1794a3544580Soga 179535164244Stedu /* Which direction to use for searching. 
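 * A positive direction means the root range lies entirely below
 * start, so only its right (higher address) subtree can intersect
 * [start, end); a negative direction means the root lies at or
 * above end, so only the left subtree can.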
*/ 1796a3544580Soga if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start) 1797a3544580Soga direction = 1; 1798a3544580Soga else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end) 1799a3544580Soga direction = -1; 1800a3544580Soga else /* nothing to do */ 1801a3544580Soga return root; 1802a3544580Soga 180335164244Stedu /* First, update root to fall within the chosen range. */ 1804a3544580Soga while (root && !PMR_INTERSECTS_WITH( 1805a3544580Soga atop(VM_PAGE_TO_PHYS(root)), 1806a3544580Soga atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz, 1807a3544580Soga start, end)) { 1808a3544580Soga if (direction == 1) 1809262a556aSdlg root = RBT_RIGHT(uvm_objtree, root); 1810a3544580Soga else 1811262a556aSdlg root = RBT_LEFT(uvm_objtree, root); 1812a3544580Soga } 1813a3544580Soga if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype) 1814a3544580Soga return root; 1815a3544580Soga 1816a3544580Soga /* 1817a3544580Soga * Root is valid, but of the wrong memtype. 1818a3544580Soga * 1819a3544580Soga * Try to find a range that has the given memtype in the subtree 1820a3544580Soga * (memtype mismatches are costly, either because the conversion 1821a3544580Soga * is expensive, or a later allocation will need to do the opposite 1822a3544580Soga * conversion, which will be expensive). 1823a3544580Soga * 1824a3544580Soga * 1825a3544580Soga * First, simply increase address until we hit something we can use. 1826a3544580Soga * Cache the upper page, so we can page-walk later. 1827a3544580Soga */ 1828a3544580Soga high = root; 1829262a556aSdlg high_next = RBT_RIGHT(uvm_objtree, high); 1830a3544580Soga while (high_next != NULL && PMR_INTERSECTS_WITH( 1831a3544580Soga atop(VM_PAGE_TO_PHYS(high_next)), 1832a3544580Soga atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz, 1833a3544580Soga start, end)) { 1834a3544580Soga high = high_next; 1835a3544580Soga if (uvm_pmr_pg_to_memtype(high) == memtype) 1836a3544580Soga return high; 1837262a556aSdlg high_next = RBT_RIGHT(uvm_objtree, high); 1838a3544580Soga } 1839a3544580Soga 1840a3544580Soga /* 1841a3544580Soga * Second, decrease the address until we hit something we can use. 1842a3544580Soga * Cache the lower page, so we can page-walk later. 1843a3544580Soga */ 1844a3544580Soga low = root; 1845262a556aSdlg low_next = RBT_LEFT(uvm_objtree, low); 1846a3544580Soga while (low_next != NULL && PMR_INTERSECTS_WITH( 1847a3544580Soga atop(VM_PAGE_TO_PHYS(low_next)), 1848a3544580Soga atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz, 1849a3544580Soga start, end)) { 1850a3544580Soga low = low_next; 1851a3544580Soga if (uvm_pmr_pg_to_memtype(low) == memtype) 1852a3544580Soga return low; 1853262a556aSdlg low_next = RBT_LEFT(uvm_objtree, low); 1854a3544580Soga } 1855a3544580Soga 1856baa04740Smiod if (low == high) 1857baa04740Smiod return NULL; 1858baa04740Smiod 185935164244Stedu /* No hits. Walk the address tree until we find something usable. */ 1860262a556aSdlg for (low = RBT_NEXT(uvm_pmr_addr, low); 1861a3544580Soga low != high; 1862262a556aSdlg low = RBT_NEXT(uvm_pmr_addr, low)) { 1863e74ab57bSariane KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)), 1864e74ab57bSariane atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz, 1865a3544580Soga start, end)); 1866a3544580Soga if (uvm_pmr_pg_to_memtype(low) == memtype) 1867a3544580Soga return low; 1868a3544580Soga } 1869a3544580Soga 187035164244Stedu /* Nothing found. */ 1871a3544580Soga return NULL; 1872a3544580Soga } 1873a3544580Soga 1874a3544580Soga /* 1875a3544580Soga * Allocate any page, the fastest way. 
Page number constraints only. 1876a3544580Soga */ 1877dba5c185Smiod psize_t 1878a3544580Soga uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result, 1879ef440d08Skettenis paddr_t start, paddr_t end, int memtype_only) 1880a3544580Soga { 1881a3544580Soga struct uvm_pmemrange *pmr; 1882a3544580Soga struct vm_page *found, *splitpg; 1883a3544580Soga psize_t fcount; 1884a3544580Soga int memtype; 1885a3544580Soga 1886a3544580Soga fcount = 0; 1887d04c0e5aSthib TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1888d04c0e5aSthib /* We're done. */ 1889d04c0e5aSthib if (fcount == count) 1890d04c0e5aSthib break; 1891d04c0e5aSthib 1892a3544580Soga /* Outside requested range. */ 1893a3544580Soga if (!(start == 0 && end == 0) && 1894a3544580Soga !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 1895a3544580Soga continue; 1896a3544580Soga 1897a3544580Soga /* Range is empty. */ 1898a3544580Soga if (pmr->nsegs == 0) 1899a3544580Soga continue; 1900a3544580Soga 1901d04c0e5aSthib /* Loop over all memtypes, starting at memtype_init. */ 1902a3544580Soga memtype = memtype_init; 1903d04c0e5aSthib while (fcount != count) { 1904a3544580Soga found = TAILQ_FIRST(&pmr->single[memtype]); 1905a3544580Soga /* 1906a3544580Soga * If found is outside the range, walk the list 1907a3544580Soga * until we find something that intersects with 1908a3544580Soga * boundaries. 1909a3544580Soga */ 1910a3544580Soga while (found && !PMR_INTERSECTS_WITH( 1911a3544580Soga atop(VM_PAGE_TO_PHYS(found)), 1912a3544580Soga atop(VM_PAGE_TO_PHYS(found)) + 1, 1913a3544580Soga start, end)) 1914a3544580Soga found = TAILQ_NEXT(found, pageq); 1915a3544580Soga 1916a3544580Soga if (found == NULL) { 19176fd881a3Skettenis /* 19186fd881a3Skettenis * Check if the size tree contains a range 19196fd881a3Skettenis * that intersects with the boundaries. As the 19206fd881a3Skettenis * allocation is for any page, try the smallest 19216fd881a3Skettenis * range so that large ranges are preserved for 19226fd881a3Skettenis * more constrained cases. Only one entry is 19236fd881a3Skettenis * checked here, to avoid a brute-force search. 19246fd881a3Skettenis * 19256fd881a3Skettenis * Note that a size tree gives pg[1] instead of 19266fd881a3Skettenis * pg[0]. 19276fd881a3Skettenis */ 1928262a556aSdlg found = RBT_MIN(uvm_pmr_size, 19296fd881a3Skettenis &pmr->size[memtype]); 193088fc093eSariane if (found != NULL) { 1931a3544580Soga found--; 19326fd881a3Skettenis if (!PMR_INTERSECTS_WITH( 19336fd881a3Skettenis atop(VM_PAGE_TO_PHYS(found)), 19346fd881a3Skettenis atop(VM_PAGE_TO_PHYS(found)) + 19356fd881a3Skettenis found->fpgsz, start, end)) 19366fd881a3Skettenis found = NULL; 19376fd881a3Skettenis } 19386fd881a3Skettenis } 19396fd881a3Skettenis if (found == NULL) { 19406fd881a3Skettenis /* 19416fd881a3Skettenis * Try address-guided search to meet the page 19426fd881a3Skettenis * number constraints. 19436fd881a3Skettenis */ 1944262a556aSdlg found = RBT_ROOT(uvm_pmr_addr, &pmr->addr); 19456fd881a3Skettenis if (found != NULL) { 1946a3544580Soga found = uvm_pmr_rootupdate(pmr, found, 1947a3544580Soga start, end, memtype); 1948a3544580Soga } 194988fc093eSariane } 1950a3544580Soga if (found != NULL) { 1951a3544580Soga uvm_pmr_assertvalid(pmr); 1952a3544580Soga uvm_pmr_remove_size(pmr, found); 1953a3544580Soga 1954a3544580Soga /* 1955a3544580Soga * If the page intersects the end, then it'll 1956a3544580Soga * need splitting. 
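 * For example, a free range starting at page 100 with
 * fpgsz == 10 and end == 106 is cut into [100, 106), which the
 * code keeps allocating from below, and [106, 110), which is put
 * back as a separate free range.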
1957a3544580Soga * 1958a3544580Soga * Note that we don't need to split if the page 1959a3544580Soga * intersects start: the drain function will 1960a3544580Soga * simply stop on hitting start. 1961a3544580Soga */ 1962a3544580Soga if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) + 1963a3544580Soga found->fpgsz > end) { 1964a3544580Soga psize_t splitsz = 1965a3544580Soga atop(VM_PAGE_TO_PHYS(found)) + 1966a3544580Soga found->fpgsz - end; 1967a3544580Soga 1968a3544580Soga uvm_pmr_remove_addr(pmr, found); 1969a3544580Soga uvm_pmr_assertvalid(pmr); 1970a3544580Soga found->fpgsz -= splitsz; 1971a3544580Soga splitpg = found + found->fpgsz; 1972a3544580Soga splitpg->fpgsz = splitsz; 1973a3544580Soga uvm_pmr_insert(pmr, splitpg, 1); 1974a3544580Soga 1975a3544580Soga /* 1976a3544580Soga * At this point, splitpg and found 1977a3544580Soga * actually should be joined. 1978a3544580Soga * But we explicitly disable that, 1979a3544580Soga * because we will start subtracting 1980a3544580Soga * from found. 1981a3544580Soga */ 1982a3544580Soga KASSERT(start == 0 || 1983a3544580Soga atop(VM_PAGE_TO_PHYS(found)) + 1984a3544580Soga found->fpgsz > start); 1985a3544580Soga uvm_pmr_insert_addr(pmr, found, 1); 1986a3544580Soga } 1987a3544580Soga 1988a3544580Soga /* 1989a3544580Soga * Fetch pages from the end. 1990a3544580Soga * If the range is larger than the requested 1991a3544580Soga * number of pages, this saves us an addr-tree 1992a3544580Soga * update. 1993a3544580Soga * 1994a3544580Soga * Since we take from the end and insert at 1995a3544580Soga * the head, any ranges keep preserved. 1996a3544580Soga */ 1997a3544580Soga while (found->fpgsz > 0 && fcount < count && 1998a3544580Soga (start == 0 || 1999a3544580Soga atop(VM_PAGE_TO_PHYS(found)) + 2000a3544580Soga found->fpgsz > start)) { 2001a3544580Soga found->fpgsz--; 2002a3544580Soga fcount++; 2003a3544580Soga TAILQ_INSERT_HEAD(result, 2004a3544580Soga &found[found->fpgsz], pageq); 2005a3544580Soga } 2006a3544580Soga if (found->fpgsz > 0) { 2007a3544580Soga uvm_pmr_insert_size(pmr, found); 2008a3544580Soga KDASSERT(fcount == count); 2009a3544580Soga uvm_pmr_assertvalid(pmr); 2010a3544580Soga return fcount; 2011a3544580Soga } 2012a3544580Soga 2013a3544580Soga /* 2014a3544580Soga * Delayed addr-tree removal. 2015a3544580Soga */ 2016a3544580Soga uvm_pmr_remove_addr(pmr, found); 2017a3544580Soga uvm_pmr_assertvalid(pmr); 2018a3544580Soga } else { 2019ef440d08Skettenis if (memtype_only) 2020ef440d08Skettenis break; 2021a3544580Soga /* 2022a3544580Soga * Skip to the next memtype. 2023a3544580Soga */ 2024a3544580Soga memtype += 1; 2025a3544580Soga if (memtype == UVM_PMR_MEMTYPE_MAX) 2026a3544580Soga memtype = 0; 2027d04c0e5aSthib if (memtype == memtype_init) 2028d04c0e5aSthib break; 2029a3544580Soga } 2030d04c0e5aSthib } 2031a3544580Soga } 2032a3544580Soga 2033a3544580Soga /* 2034a3544580Soga * Search finished. 2035a3544580Soga * 2036a3544580Soga * Ran out of ranges before enough pages were gathered, or we hit the 2037a3544580Soga * case where found->fpgsz == count - fcount, in which case the 2038a3544580Soga * above exit condition didn't trigger. 2039a3544580Soga * 2040a3544580Soga * On failure, caller will free the pages. 2041a3544580Soga */ 2042a3544580Soga return fcount; 2043a3544580Soga } 2044a3544580Soga 2045a3544580Soga #ifdef DDB 2046a3544580Soga /* 2047a3544580Soga * Print information about pmemrange. 2048a3544580Soga * Does not do locking (so either call it from DDB or acquire fpageq lock 2049a3544580Soga * before invoking. 
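 *
 * From ddb(4) the typical invocation would be something along the
 * lines of (the exact command syntax is an assumption, not taken
 * from this file):
 *
 *	ddb> call uvm_pmr_print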
2050a3544580Soga */ 2051a3544580Soga void 2052a3544580Soga uvm_pmr_print(void) 2053a3544580Soga { 2054a3544580Soga struct uvm_pmemrange *pmr; 2055a3544580Soga struct vm_page *pg; 2056a3544580Soga psize_t size[UVM_PMR_MEMTYPE_MAX]; 2057a3544580Soga psize_t free; 2058a3544580Soga int useq_len; 2059a3544580Soga int mt; 2060a3544580Soga 2061a3544580Soga printf("Ranges, use queue:\n"); 2062a3544580Soga useq_len = 0; 2063a3544580Soga TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 2064a3544580Soga useq_len++; 2065a3544580Soga free = 0; 2066a3544580Soga for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 2067262a556aSdlg pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]); 2068a3544580Soga if (pg != NULL) 2069a3544580Soga pg--; 2070a3544580Soga else 2071a3544580Soga pg = TAILQ_FIRST(&pmr->single[mt]); 2072a3544580Soga size[mt] = (pg == NULL ? 0 : pg->fpgsz); 2073a3544580Soga 2074262a556aSdlg RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr) 2075a3544580Soga free += pg->fpgsz; 2076a3544580Soga } 2077a3544580Soga 2078a3544580Soga printf("* [0x%lx-0x%lx] use=%d nsegs=%ld", 2079a3544580Soga (unsigned long)pmr->low, (unsigned long)pmr->high, 2080a3544580Soga pmr->use, (unsigned long)pmr->nsegs); 2081a3544580Soga for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 2082a3544580Soga printf(" maxsegsz[%d]=0x%lx", mt, 2083a3544580Soga (unsigned long)size[mt]); 2084a3544580Soga } 2085a3544580Soga printf(" free=0x%lx\n", (unsigned long)free); 2086a3544580Soga } 2087a3544580Soga printf("#ranges = %d\n", useq_len); 2088a3544580Soga } 2089a3544580Soga #endif 2090205bb46dSariane 209190ee2fe0Sbeck /* 209290ee2fe0Sbeck * uvm_wait_pla: wait (sleep) for the page daemon to free some pages 209390ee2fe0Sbeck * in a specific physmem area. 209490ee2fe0Sbeck * 209590ee2fe0Sbeck * Returns ENOMEM if the pagedaemon failed to free any pages. 209690ee2fe0Sbeck * If not failok, failure will lead to panic. 209790ee2fe0Sbeck * 209890ee2fe0Sbeck * Must be called with fpageq locked. 209990ee2fe0Sbeck */ 210090ee2fe0Sbeck int 210190ee2fe0Sbeck uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok) 210290ee2fe0Sbeck { 210390ee2fe0Sbeck struct uvm_pmalloc pma; 210490ee2fe0Sbeck const char *wmsg = "pmrwait"; 210590ee2fe0Sbeck 2106d60475a6Smpi KASSERT(curcpu()->ci_idepth == 0); 2107d60475a6Smpi 210890ee2fe0Sbeck if (curproc == uvm.pagedaemon_proc) { 21099a0f4daeSbeck /* 21100bd995e1Stedu * This is not that uncommon when the pagedaemon is trying 21110bd995e1Stedu * to flush out a large mmapped file. VOP_WRITE will circle 21120bd995e1Stedu * back through the buffer cache and try to get more memory. 21130bd995e1Stedu * The pagedaemon starts by calling bufbackoff, but we can 21140bd995e1Stedu * easily use up that reserve in a single scan iteration. 21150bd995e1Stedu */ 21160bd995e1Stedu uvm_unlock_fpageq(); 21170b4f309dSmpi if (bufbackoff(NULL, atop(size)) >= atop(size)) { 21180bd995e1Stedu uvm_lock_fpageq(); 21190bd995e1Stedu return 0; 21200bd995e1Stedu } 21210bd995e1Stedu uvm_lock_fpageq(); 21220bd995e1Stedu 21230bd995e1Stedu /* 21249a0f4daeSbeck * XXX detect pagedaemon deadlock - see comment in 21259a0f4daeSbeck * uvm_wait(), as this is exactly the same issue. 
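 * The page daemon cannot queue a request and wait for itself, so
 * below we only sleep briefly and return, in the hope that enough
 * memory was freed in the meantime.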
21269a0f4daeSbeck */ 21279a0f4daeSbeck printf("pagedaemon: wait_pla deadlock detected!\n"); 21284bc97b15Scheloha msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, 21294bc97b15Scheloha MSEC_TO_NSEC(125)); 21309a0f4daeSbeck #if defined(DEBUG) 21319a0f4daeSbeck /* DEBUG: panic so we can debug it */ 21329a0f4daeSbeck panic("wait_pla pagedaemon deadlock"); 21339a0f4daeSbeck #endif 213490ee2fe0Sbeck return 0; 213590ee2fe0Sbeck } 213690ee2fe0Sbeck 213790ee2fe0Sbeck for (;;) { 213890ee2fe0Sbeck pma.pm_constraint.ucr_low = low; 213990ee2fe0Sbeck pma.pm_constraint.ucr_high = high; 214090ee2fe0Sbeck pma.pm_size = size; 214190ee2fe0Sbeck pma.pm_flags = UVM_PMA_LINKED; 214290ee2fe0Sbeck TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq); 214390ee2fe0Sbeck 214490ee2fe0Sbeck wakeup(&uvm.pagedaemon); /* wake the daemon! */ 214590ee2fe0Sbeck while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY)) 2146744ebc9fSmpi msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP); 214790ee2fe0Sbeck 214890ee2fe0Sbeck if (!(pma.pm_flags & UVM_PMA_FREED) && 214990ee2fe0Sbeck pma.pm_flags & UVM_PMA_FAIL) { 215090ee2fe0Sbeck if (failok) 215190ee2fe0Sbeck return ENOMEM; 215290ee2fe0Sbeck printf("uvm_wait: failed to free %ld pages between " 215390ee2fe0Sbeck "0x%lx-0x%lx\n", atop(size), low, high); 215490ee2fe0Sbeck } else 215590ee2fe0Sbeck return 0; 215690ee2fe0Sbeck } 215790ee2fe0Sbeck /* UNREACHABLE */ 215890ee2fe0Sbeck } 215990ee2fe0Sbeck 216090ee2fe0Sbeck /* 216190ee2fe0Sbeck * Wake up uvm_pmalloc sleepers. 216290ee2fe0Sbeck */ 216390ee2fe0Sbeck void 216490ee2fe0Sbeck uvm_wakeup_pla(paddr_t low, psize_t len) 216590ee2fe0Sbeck { 216690ee2fe0Sbeck struct uvm_pmalloc *pma, *pma_next; 216790ee2fe0Sbeck paddr_t high; 216890ee2fe0Sbeck 216990ee2fe0Sbeck high = low + len; 217090ee2fe0Sbeck 217135164244Stedu /* Wake specific allocations waiting for this memory. */ 217290ee2fe0Sbeck for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL; 217390ee2fe0Sbeck pma = pma_next) { 217490ee2fe0Sbeck pma_next = TAILQ_NEXT(pma, pmq); 217590ee2fe0Sbeck 217690ee2fe0Sbeck if (low < pma->pm_constraint.ucr_high && 217790ee2fe0Sbeck high > pma->pm_constraint.ucr_low) { 217890ee2fe0Sbeck pma->pm_flags |= UVM_PMA_FREED; 217990ee2fe0Sbeck if (!(pma->pm_flags & UVM_PMA_BUSY)) { 218090ee2fe0Sbeck pma->pm_flags &= ~UVM_PMA_LINKED; 218190ee2fe0Sbeck TAILQ_REMOVE(&uvm.pmr_control.allocs, pma, 218290ee2fe0Sbeck pmq); 218390ee2fe0Sbeck wakeup(pma); 218490ee2fe0Sbeck } 218590ee2fe0Sbeck } 218690ee2fe0Sbeck } 218790ee2fe0Sbeck } 2188ef440d08Skettenis 2189ef440d08Skettenis void 2190ef440d08Skettenis uvm_pagezero_thread(void *arg) 2191ef440d08Skettenis { 2192ef440d08Skettenis struct pglist pgl; 2193ef440d08Skettenis struct vm_page *pg; 2194ef440d08Skettenis int count; 2195ef440d08Skettenis 2196ef440d08Skettenis /* Run at the lowest possible priority. 
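 * NZERO + PRIO_MAX corresponds to a nice value of +20, the weakest
 * scheduling priority a process can ask for, so page zeroing only
 * consumes otherwise idle cycles.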
*/ 2197ef440d08Skettenis curproc->p_p->ps_nice = NZERO + PRIO_MAX; 2198ef440d08Skettenis 2199ef440d08Skettenis KERNEL_UNLOCK(); 2200ef440d08Skettenis 22012dfb4493Sguenther TAILQ_INIT(&pgl); 2202ef440d08Skettenis for (;;) { 2203ef440d08Skettenis uvm_lock_fpageq(); 2204ef440d08Skettenis while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET || 2205ef440d08Skettenis (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY, 2206ef440d08Skettenis &pgl, 0, 0, 1)) == 0) { 22072404448fSjsg msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock, 22082404448fSjsg MAXPRI, "pgzero", INFSLP); 2209ef440d08Skettenis } 2210ef440d08Skettenis uvm_unlock_fpageq(); 2211ef440d08Skettenis 2212ef440d08Skettenis TAILQ_FOREACH(pg, &pgl, pageq) { 2213ef440d08Skettenis uvm_pagezero(pg); 2214ef440d08Skettenis atomic_setbits_int(&pg->pg_flags, PG_ZERO); 2215ef440d08Skettenis } 2216ef440d08Skettenis 2217ef440d08Skettenis uvm_lock_fpageq(); 2218ef440d08Skettenis while (!TAILQ_EMPTY(&pgl)) 2219ef440d08Skettenis uvm_pmr_remove_1strange(&pgl, 0, NULL, 0); 2220ef440d08Skettenis uvmexp.zeropages += count; 2221ef440d08Skettenis uvm_unlock_fpageq(); 2222ef440d08Skettenis 2223ef440d08Skettenis yield(); 2224ef440d08Skettenis } 2225ef440d08Skettenis } 222682673a18Smpi 222782673a18Smpi #if defined(MULTIPROCESSOR) && defined(__HAVE_UVM_PERCPU) 222882673a18Smpi int 222982673a18Smpi uvm_pmr_cache_alloc(struct uvm_pmr_cache_item *upci) 223082673a18Smpi { 223182673a18Smpi struct vm_page *pg; 223282673a18Smpi struct pglist pgl; 223382673a18Smpi int flags = UVM_PLA_NOWAIT|UVM_PLA_NOWAKE; 223482673a18Smpi int npages = UVM_PMR_CACHEMAGSZ; 223582673a18Smpi 223682673a18Smpi splassert(IPL_VM); 223782673a18Smpi KASSERT(upci->upci_npages == 0); 223882673a18Smpi 223982673a18Smpi TAILQ_INIT(&pgl); 224082673a18Smpi if (uvm_pmr_getpages(npages, 0, 0, 1, 0, npages, flags, &pgl)) 224182673a18Smpi return -1; 224282673a18Smpi 224382673a18Smpi while ((pg = TAILQ_FIRST(&pgl)) != NULL) { 224482673a18Smpi TAILQ_REMOVE(&pgl, pg, pageq); 224582673a18Smpi upci->upci_pages[upci->upci_npages] = pg; 224682673a18Smpi upci->upci_npages++; 224782673a18Smpi } 224882673a18Smpi atomic_add_int(&uvmexp.percpucaches, npages); 224982673a18Smpi 225082673a18Smpi return 0; 225182673a18Smpi } 225282673a18Smpi 225382673a18Smpi struct vm_page * 225482673a18Smpi uvm_pmr_cache_get(int flags) 225582673a18Smpi { 225682673a18Smpi struct uvm_pmr_cache *upc = &curcpu()->ci_uvm; 225782673a18Smpi struct uvm_pmr_cache_item *upci; 225882673a18Smpi struct vm_page *pg; 225982673a18Smpi int s; 226082673a18Smpi 226182673a18Smpi s = splvm(); 226282673a18Smpi upci = &upc->upc_magz[upc->upc_actv]; 226382673a18Smpi if (upci->upci_npages == 0) { 226482673a18Smpi unsigned int prev; 226582673a18Smpi 226682673a18Smpi prev = (upc->upc_actv == 0) ? 
1 : 0; 226782673a18Smpi upci = &upc->upc_magz[prev]; 226882673a18Smpi if (upci->upci_npages == 0) { 226982673a18Smpi atomic_inc_int(&uvmexp.pcpmiss); 227082673a18Smpi if (uvm_pmr_cache_alloc(upci)) { 227182673a18Smpi splx(s); 227282673a18Smpi return uvm_pmr_getone(flags); 227382673a18Smpi } 227482673a18Smpi } 227582673a18Smpi /* Swap magazines */ 227682673a18Smpi upc->upc_actv = prev; 227782673a18Smpi } else { 227882673a18Smpi atomic_inc_int(&uvmexp.pcphit); 227982673a18Smpi } 228082673a18Smpi 228182673a18Smpi atomic_dec_int(&uvmexp.percpucaches); 228282673a18Smpi upci->upci_npages--; 228382673a18Smpi pg = upci->upci_pages[upci->upci_npages]; 228482673a18Smpi splx(s); 228582673a18Smpi 228682673a18Smpi if (flags & UVM_PLA_ZERO) 228782673a18Smpi uvm_pagezero(pg); 228882673a18Smpi 228982673a18Smpi return pg; 229082673a18Smpi } 229182673a18Smpi 2292804e3d51Smpi unsigned int 229382673a18Smpi uvm_pmr_cache_free(struct uvm_pmr_cache_item *upci) 229482673a18Smpi { 229582673a18Smpi struct pglist pgl; 229682673a18Smpi unsigned int i; 229782673a18Smpi 229882673a18Smpi splassert(IPL_VM); 229982673a18Smpi 230082673a18Smpi TAILQ_INIT(&pgl); 230182673a18Smpi for (i = 0; i < upci->upci_npages; i++) 230282673a18Smpi TAILQ_INSERT_TAIL(&pgl, upci->upci_pages[i], pageq); 230382673a18Smpi 230482673a18Smpi uvm_pmr_freepageq(&pgl); 230582673a18Smpi 230682673a18Smpi atomic_sub_int(&uvmexp.percpucaches, upci->upci_npages); 230782673a18Smpi upci->upci_npages = 0; 230882673a18Smpi memset(upci->upci_pages, 0, sizeof(upci->upci_pages)); 2309804e3d51Smpi 2310804e3d51Smpi return i; 231182673a18Smpi } 231282673a18Smpi 231382673a18Smpi void 231482673a18Smpi uvm_pmr_cache_put(struct vm_page *pg) 231582673a18Smpi { 231682673a18Smpi struct uvm_pmr_cache *upc = &curcpu()->ci_uvm; 231782673a18Smpi struct uvm_pmr_cache_item *upci; 23189326bf8dSmpi struct uvm_pmemrange *pmr; 231982673a18Smpi int s; 232082673a18Smpi 23219326bf8dSmpi /* 23229326bf8dSmpi * Always give back low pages to the allocator to not accelerate 23239326bf8dSmpi * their exhaustion. 23249326bf8dSmpi */ 23259326bf8dSmpi pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 23269326bf8dSmpi if (pmr->use > 0) { 23279326bf8dSmpi uvm_pmr_freepages(pg, 1); 23289326bf8dSmpi return; 23299326bf8dSmpi } 23309326bf8dSmpi 233182673a18Smpi s = splvm(); 233282673a18Smpi upci = &upc->upc_magz[upc->upc_actv]; 233382673a18Smpi if (upci->upci_npages >= UVM_PMR_CACHEMAGSZ) { 233482673a18Smpi unsigned int prev; 233582673a18Smpi 233682673a18Smpi prev = (upc->upc_actv == 0) ? 
1 : 0; 233782673a18Smpi upci = &upc->upc_magz[prev]; 233882673a18Smpi if (upci->upci_npages > 0) 233982673a18Smpi uvm_pmr_cache_free(upci); 234082673a18Smpi 234182673a18Smpi /* Swap magazines */ 234282673a18Smpi upc->upc_actv = prev; 234382673a18Smpi KASSERT(upci->upci_npages == 0); 234482673a18Smpi } 234582673a18Smpi 234682673a18Smpi upci->upci_pages[upci->upci_npages] = pg; 234782673a18Smpi upci->upci_npages++; 234882673a18Smpi atomic_inc_int(&uvmexp.percpucaches); 234982673a18Smpi splx(s); 235082673a18Smpi } 235182673a18Smpi 2352804e3d51Smpi unsigned int 235382673a18Smpi uvm_pmr_cache_drain(void) 235482673a18Smpi { 235582673a18Smpi struct uvm_pmr_cache *upc = &curcpu()->ci_uvm; 2356804e3d51Smpi unsigned int freed = 0; 235782673a18Smpi int s; 235882673a18Smpi 235982673a18Smpi s = splvm(); 2360804e3d51Smpi freed += uvm_pmr_cache_free(&upc->upc_magz[0]); 2361804e3d51Smpi freed += uvm_pmr_cache_free(&upc->upc_magz[1]); 236282673a18Smpi splx(s); 2363804e3d51Smpi 2364804e3d51Smpi return freed; 236582673a18Smpi } 236682673a18Smpi 236782673a18Smpi #else /* !(MULTIPROCESSOR && __HAVE_UVM_PERCPU) */ 236882673a18Smpi 236982673a18Smpi struct vm_page * 237082673a18Smpi uvm_pmr_cache_get(int flags) 237182673a18Smpi { 237282673a18Smpi return uvm_pmr_getone(flags); 237382673a18Smpi } 237482673a18Smpi 237582673a18Smpi void 237682673a18Smpi uvm_pmr_cache_put(struct vm_page *pg) 237782673a18Smpi { 237882673a18Smpi uvm_pmr_freepages(pg, 1); 237982673a18Smpi } 238082673a18Smpi 2381804e3d51Smpi unsigned int 238282673a18Smpi uvm_pmr_cache_drain(void) 238382673a18Smpi { 2384804e3d51Smpi return 0; 238582673a18Smpi } 238682673a18Smpi #endif 2387
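
/*
 * A minimal sketch of the intended consumer path for the per-CPU page
 * cache above; the caller and its error handling are assumptions, not
 * code taken from this file.  On kernels without MULTIPROCESSOR and
 * __HAVE_UVM_PERCPU the same calls fall through to uvm_pmr_getone()
 * and uvm_pmr_freepages() as defined right above.
 *
 *	struct vm_page *pg;
 *
 *	pg = uvm_pmr_cache_get(UVM_PLA_NOWAIT | UVM_PLA_ZERO);
 *	if (pg == NULL)
 *		return ENOMEM;
 *	... map and use the zeroed page ...
 *	uvm_pmr_cache_put(pg);
 *
 * The page daemon can later reclaim cached magazines on the local CPU
 * with uvm_pmr_cache_drain(), which returns the number of pages freed.
 */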