1 /*	$OpenBSD: uvm_pmemrange.c,v 1.54 2019/05/09 20:36:44 beck Exp $	*/
2
3 /*
4  * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18
19 #include <sys/param.h>
20 #include <sys/systm.h>
21 #include <uvm/uvm.h>
22 #include <sys/malloc.h>
23 #include <sys/kernel.h>
24 #include <sys/kthread.h>
25 #include <sys/mount.h>
26
27 /*
28  * 2 trees: addr tree and size tree.
29  *
30  * The allocator keeps chunks of free pages (called a range).
31  * Two pages are part of the same range if:
32  * - all pages in between are part of that range,
33  * - they are of the same memory type (zeroed or non-zeroed),
34  * - they are part of the same pmemrange.
35  * A pmemrange is a range of memory which is part of the same vm_physseg
36  * and has a use-count.
37  *
38  * addr tree is vm_page[0].objt
39  * size tree is vm_page[1].objt
40  *
41  * The size tree is not used for memory ranges of 1 page; instead,
42  * the single queue is vm_page[0].pageq
43  *
44  * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
45  * are joined, unless:
46  * - they have pages in between them which are not free
47  * - they belong to different memtypes (zeroed vs dirty memory)
48  * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
49  * - they are not a continuation of the same array
50  * The latter issue is caused by vm_physseg ordering and splitting from the
51  * MD initialization machinery. The MD code is dependent on freelists and
52  * happens to split ISA memory from non-ISA memory.
53  * (Note: freelists die die die!)
54  *
55  * uvm_page_init guarantees that every vm_physseg contains an array of
56  * struct vm_page. Also, uvm_page_physload allocates an array of struct
57  * vm_page. This code depends on that array. The array may break across
58  * vm_physseg boundaries.
59  */
60
61 /*
62  * Validate the flags of the page. (Used in asserts.)
63  * Any free page must have the PQ_FREE flag set.
64  * Free pages may be zeroed.
65  * Pmap flags are left untouched.
66  *
67  * The PQ_FREE flag is not checked here: by not checking, we can easily use
68  * this check on pages which are being freed.
69  */
70 #define VALID_FLAGS(pg_flags) \
71 	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
72
73 /* Tree comparators. */
74 int	uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
75 	    const struct uvm_pmemrange *);
76 int	uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
77 int	uvm_pmr_pg_to_memtype(struct vm_page *);
78
79 #ifdef DDB
80 void	uvm_pmr_print(void);
81 #endif
82
83 /*
84  * Memory types. The page flags are used to derive what the current memory
85  * type of a page is.
86  */
87 int
88 uvm_pmr_pg_to_memtype(struct vm_page *pg)
89 {
90 	if (pg->pg_flags & PG_ZERO)
91 		return UVM_PMR_MEMTYPE_ZERO;
92 	/* Default: dirty memory.
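	 * Pages without PG_ZERO are not known to be zero-filled.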
*/ 93 return UVM_PMR_MEMTYPE_DIRTY; 94 } 95 96 /* Trees. */ 97 RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp); 98 RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp); 99 RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr, 100 uvm_pmemrange_addr_cmp); 101 102 /* Validation. */ 103 #ifdef DEBUG 104 void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr); 105 #else 106 #define uvm_pmr_assertvalid(pmr) do {} while (0) 107 #endif 108 109 psize_t uvm_pmr_get1page(psize_t, int, struct pglist *, 110 paddr_t, paddr_t, int); 111 112 struct uvm_pmemrange *uvm_pmr_allocpmr(void); 113 struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int); 114 struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *, 115 struct vm_page *, int); 116 void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, 117 struct vm_page *pg, struct vm_page **pg_prev, 118 struct vm_page **pg_next); 119 struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *, 120 struct vm_page *, paddr_t); 121 psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t, 122 struct vm_page **, int); 123 void uvm_pmr_split(paddr_t); 124 struct uvm_pmemrange *uvm_pmemrange_find(paddr_t); 125 struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *, 126 struct uvm_pmemrange *); 127 psize_t pow2divide(psize_t, psize_t); 128 struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *, 129 struct vm_page *, paddr_t, paddr_t, int); 130 131 /* 132 * Computes num/denom and rounds it up to the next power-of-2. 133 * 134 * This is a division function which calculates an approximation of 135 * num/denom, with result =~ num/denom. It is meant to be fast and doesn't 136 * have to be accurate. 137 * 138 * Providing too large a value makes the allocator slightly faster, at the 139 * risk of hitting the failure case more often. Providing too small a value 140 * makes the allocator a bit slower, but less likely to hit a failure case. 141 */ 142 psize_t 143 pow2divide(psize_t num, psize_t denom) 144 { 145 int rshift; 146 147 for (rshift = 0; num > denom; rshift++, denom <<= 1) 148 ; 149 return (paddr_t)1 << rshift; 150 } 151 152 /* 153 * Predicate: lhs is a subrange or rhs. 154 * 155 * If rhs_low == 0: don't care about lower bound. 156 * If rhs_high == 0: don't care about upper bound. 157 */ 158 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \ 159 (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \ 160 ((rhs_high) == 0 || (lhs_high) <= (rhs_high))) 161 162 /* 163 * Predicate: lhs intersects with rhs. 164 * 165 * If rhs_low == 0: don't care about lower bound. 166 * If rhs_high == 0: don't care about upper bound. 167 * Ranges don't intersect if they don't have any page in common, array 168 * semantics mean that < instead of <= should be used here. 169 */ 170 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \ 171 (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \ 172 ((rhs_high) == 0 || (lhs_low) < (rhs_high))) 173 174 /* 175 * Align to power-of-2 alignment. 176 */ 177 #define PMR_ALIGN(pgno, align) \ 178 (((pgno) + ((align) - 1)) & ~((align) - 1)) 179 180 181 /* 182 * Comparator: sort by address ascending. 183 */ 184 int 185 uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs, 186 const struct uvm_pmemrange *rhs) 187 { 188 return lhs->low < rhs->low ? -1 : lhs->low > rhs->low; 189 } 190 191 /* 192 * Comparator: sort by use ascending. 193 * 194 * The higher the use value of a range, the more devices need memory in 195 * this range. Therefore allocate from the range with the lowest use first. 
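 * Ties on the use counter are broken by address, so the ordering is total.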
196 */ 197 int 198 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs) 199 { 200 int result; 201 202 result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use; 203 if (result == 0) 204 result = uvm_pmemrange_addr_cmp(lhs, rhs); 205 return result; 206 } 207 208 int 209 uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 210 { 211 paddr_t lhs_addr, rhs_addr; 212 213 lhs_addr = VM_PAGE_TO_PHYS(lhs); 214 rhs_addr = VM_PAGE_TO_PHYS(rhs); 215 216 return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr); 217 } 218 219 int 220 uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 221 { 222 psize_t lhs_size, rhs_size; 223 int cmp; 224 225 /* Using second tree, so we receive pg[1] instead of pg[0]. */ 226 lhs_size = (lhs - 1)->fpgsz; 227 rhs_size = (rhs - 1)->fpgsz; 228 229 cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size); 230 if (cmp == 0) 231 cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1); 232 return cmp; 233 } 234 235 /* 236 * Find the first range of free pages that is at least sz pages long. 237 */ 238 struct vm_page * 239 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti) 240 { 241 struct vm_page *node, *best; 242 243 KASSERT(sz >= 1); 244 245 if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti])) 246 return TAILQ_FIRST(&pmr->single[mti]); 247 248 node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]); 249 best = NULL; 250 while (node != NULL) { 251 if ((node - 1)->fpgsz >= sz) { 252 best = (node - 1); 253 node = RBT_LEFT(uvm_objtree, node); 254 } else 255 node = RBT_RIGHT(uvm_objtree, node); 256 } 257 return best; 258 } 259 260 /* 261 * Finds the next range. The next range has a size >= pg->fpgsz. 262 * Returns NULL if no more ranges are available. 263 */ 264 struct vm_page * 265 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt) 266 { 267 struct vm_page *npg; 268 269 KASSERT(pmr != NULL && pg != NULL); 270 if (pg->fpgsz == 1) { 271 if (TAILQ_NEXT(pg, pageq) != NULL) 272 return TAILQ_NEXT(pg, pageq); 273 else 274 npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]); 275 } else 276 npg = RBT_NEXT(uvm_pmr_size, pg + 1); 277 278 return npg == NULL ? NULL : npg - 1; 279 } 280 281 /* 282 * Finds the previous and next ranges relative to the (uninserted) pg range. 283 * 284 * *pg_prev == NULL if no previous range is available, that can join with 285 * pg. 286 * *pg_next == NULL if no next range is available, that can join with 287 * pg. 288 */ 289 void 290 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg, 291 struct vm_page **pg_prev, struct vm_page **pg_next) 292 { 293 KASSERT(pg_prev != NULL && pg_next != NULL); 294 295 *pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 296 if (*pg_next == NULL) 297 *pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 298 else 299 *pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next); 300 301 KDASSERT(*pg_next == NULL || 302 VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg)); 303 KDASSERT(*pg_prev == NULL || 304 VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg)); 305 306 /* Reset if not contig. */ 307 if (*pg_prev != NULL && 308 (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz 309 != atop(VM_PAGE_TO_PHYS(pg)) || 310 *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */ 311 uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg))) 312 *pg_prev = NULL; 313 if (*pg_next != NULL && 314 (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz 315 != atop(VM_PAGE_TO_PHYS(*pg_next)) || 316 pg + pg->fpgsz != *pg_next || /* Array broke. 
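	   (the struct vm_page pointers are not contiguous, even if the physical addresses are)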
*/ 317 uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg))) 318 *pg_next = NULL; 319 return; 320 } 321 322 /* 323 * Remove a range from the address tree. 324 * Address tree maintains pmr counters. 325 */ 326 void 327 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg) 328 { 329 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 330 KDASSERT(pg->pg_flags & PQ_FREE); 331 RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg); 332 333 pmr->nsegs--; 334 } 335 /* 336 * Remove a range from the size tree. 337 */ 338 void 339 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 340 { 341 int memtype; 342 #ifdef DEBUG 343 struct vm_page *i; 344 #endif 345 346 KDASSERT(pg->fpgsz >= 1); 347 KDASSERT(pg->pg_flags & PQ_FREE); 348 memtype = uvm_pmr_pg_to_memtype(pg); 349 350 if (pg->fpgsz == 1) { 351 #ifdef DEBUG 352 TAILQ_FOREACH(i, &pmr->single[memtype], pageq) { 353 if (i == pg) 354 break; 355 } 356 KDASSERT(i == pg); 357 #endif 358 TAILQ_REMOVE(&pmr->single[memtype], pg, pageq); 359 } else { 360 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype], 361 pg + 1) == pg + 1); 362 RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1); 363 } 364 } 365 /* Remove from both trees. */ 366 void 367 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg) 368 { 369 uvm_pmr_assertvalid(pmr); 370 uvm_pmr_remove_size(pmr, pg); 371 uvm_pmr_remove_addr(pmr, pg); 372 uvm_pmr_assertvalid(pmr); 373 } 374 375 /* 376 * Insert the range described in pg. 377 * Returns the range thus created (which may be joined with the previous and 378 * next ranges). 379 * If no_join, the caller guarantees that the range cannot possibly join 380 * with adjecent ranges. 381 */ 382 struct vm_page * 383 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 384 { 385 struct vm_page *prev, *next; 386 387 #ifdef DEBUG 388 struct vm_page *i; 389 int mt; 390 #endif 391 392 KDASSERT(pg->pg_flags & PQ_FREE); 393 KDASSERT(pg->fpgsz >= 1); 394 395 #ifdef DEBUG 396 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 397 TAILQ_FOREACH(i, &pmr->single[mt], pageq) 398 KDASSERT(i != pg); 399 if (pg->fpgsz > 1) { 400 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt], 401 pg + 1) == NULL); 402 } 403 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL); 404 } 405 #endif 406 407 if (!no_join) { 408 uvm_pmr_pnaddr(pmr, pg, &prev, &next); 409 if (next != NULL) { 410 uvm_pmr_remove_size(pmr, next); 411 uvm_pmr_remove_addr(pmr, next); 412 pg->fpgsz += next->fpgsz; 413 next->fpgsz = 0; 414 } 415 if (prev != NULL) { 416 uvm_pmr_remove_size(pmr, prev); 417 prev->fpgsz += pg->fpgsz; 418 pg->fpgsz = 0; 419 return prev; 420 } 421 } 422 423 RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg); 424 425 pmr->nsegs++; 426 427 return pg; 428 } 429 /* 430 * Insert the range described in pg. 431 * Returns the range thus created (which may be joined with the previous and 432 * next ranges). 433 * Page must already be in the address tree. 
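 * (i.e. uvm_pmr_insert_addr() has already been called for it).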
434 */ 435 void 436 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 437 { 438 int memtype; 439 #ifdef DEBUG 440 struct vm_page *i; 441 int mti; 442 #endif 443 444 KDASSERT(pg->fpgsz >= 1); 445 KDASSERT(pg->pg_flags & PQ_FREE); 446 447 memtype = uvm_pmr_pg_to_memtype(pg); 448 #ifdef DEBUG 449 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 450 TAILQ_FOREACH(i, &pmr->single[mti], pageq) 451 KDASSERT(i != pg); 452 if (pg->fpgsz > 1) { 453 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti], 454 pg + 1) == NULL); 455 } 456 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 457 } 458 for (i = pg; i < pg + pg->fpgsz; i++) 459 KASSERT(uvm_pmr_pg_to_memtype(i) == memtype); 460 #endif 461 462 if (pg->fpgsz == 1) 463 TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq); 464 else 465 RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1); 466 } 467 /* Insert in both trees. */ 468 struct vm_page * 469 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 470 { 471 uvm_pmr_assertvalid(pmr); 472 pg = uvm_pmr_insert_addr(pmr, pg, no_join); 473 uvm_pmr_insert_size(pmr, pg); 474 uvm_pmr_assertvalid(pmr); 475 return pg; 476 } 477 478 /* 479 * Find the last page that is part of this segment. 480 * => pg: the range at which to start the search. 481 * => boundary: the page number boundary specification (0 = no boundary). 482 * => pmr: the pmemrange of the page. 483 * 484 * This function returns 1 before the next range, so if you want to have the 485 * next range, you need to run TAILQ_NEXT(result, pageq) after calling. 486 * The reason is that this way, the length of the segment is easily 487 * calculated using: atop(result) - atop(pg) + 1. 488 * Hence this function also never returns NULL. 489 */ 490 struct vm_page * 491 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr, 492 struct vm_page *pg, paddr_t boundary) 493 { 494 paddr_t first_boundary; 495 struct vm_page *next; 496 struct vm_page *prev; 497 498 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && 499 pmr->high > atop(VM_PAGE_TO_PHYS(pg))); 500 if (boundary != 0) { 501 first_boundary = 502 PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary); 503 } else 504 first_boundary = 0; 505 506 /* 507 * Increase next until it hits the first page of the next segment. 508 * 509 * While loop checks the following: 510 * - next != NULL we have not reached the end of pgl 511 * - boundary == 0 || next < first_boundary 512 * we do not cross a boundary 513 * - atop(prev) + 1 == atop(next) 514 * still in the same segment 515 * - low <= last 516 * - high > last still in the same memory range 517 * - memtype is equal allocator is unable to view different memtypes 518 * as part of the same segment 519 * - prev + 1 == next no array breakage occurs 520 */ 521 prev = pg; 522 next = TAILQ_NEXT(prev, pageq); 523 while (next != NULL && 524 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) && 525 atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) && 526 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && 527 pmr->high > atop(VM_PAGE_TO_PHYS(next)) && 528 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && 529 prev + 1 == next) { 530 prev = next; 531 next = TAILQ_NEXT(prev, pageq); 532 } 533 534 /* 535 * End of this segment. 536 */ 537 return prev; 538 } 539 540 /* 541 * Remove the first segment of contiguous pages from pgl. 542 * A segment ends if it crosses boundary (unless boundary = 0) or 543 * if it would enter a different uvm_pmemrange. 544 * 545 * Work: the page range that the caller is currently working with. 
546 * May be null. 547 * 548 * If is_desperate is non-zero, the smallest segment is erased. Otherwise, 549 * the first segment is erased (which, if called by uvm_pmr_getpages(), 550 * probably is the smallest or very close to it). 551 */ 552 psize_t 553 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary, 554 struct vm_page **work, int is_desperate) 555 { 556 struct vm_page *start, *end, *iter, *iter_end, *inserted; 557 psize_t count; 558 struct uvm_pmemrange *pmr, *pmr_iter; 559 560 KASSERT(!TAILQ_EMPTY(pgl)); 561 562 /* 563 * Initialize to first page. 564 * Unless desperate scan finds a better candidate, this is what'll be 565 * erased. 566 */ 567 start = TAILQ_FIRST(pgl); 568 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); 569 end = uvm_pmr_findnextsegment(pmr, start, boundary); 570 571 /* 572 * If we are desperate, we _really_ want to get rid of the smallest 573 * element (rather than a close match to the smallest element). 574 */ 575 if (is_desperate) { 576 /* Linear search for smallest segment. */ 577 pmr_iter = pmr; 578 for (iter = TAILQ_NEXT(end, pageq); 579 iter != NULL && start != end; 580 iter = TAILQ_NEXT(iter_end, pageq)) { 581 /* 582 * Only update pmr if it doesn't match current 583 * iteration. 584 */ 585 if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) || 586 pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) { 587 pmr_iter = uvm_pmemrange_find(atop( 588 VM_PAGE_TO_PHYS(iter))); 589 } 590 591 iter_end = uvm_pmr_findnextsegment(pmr_iter, iter, 592 boundary); 593 594 /* 595 * Current iteration is smaller than best match so 596 * far; update. 597 */ 598 if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) < 599 VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) { 600 start = iter; 601 end = iter_end; 602 pmr = pmr_iter; 603 } 604 } 605 } 606 607 /* 608 * Calculate count and end of the list. 609 */ 610 count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1; 611 end = TAILQ_NEXT(end, pageq); 612 613 /* 614 * Actually remove the range of pages. 615 * 616 * Sadly, this cannot be done using pointer iteration: 617 * vm_physseg is not guaranteed to be sorted on address, hence 618 * uvm_page_init() may not have initialized its array sorted by 619 * page number. 620 */ 621 for (iter = start; iter != end; iter = iter_end) { 622 iter_end = TAILQ_NEXT(iter, pageq); 623 TAILQ_REMOVE(pgl, iter, pageq); 624 } 625 626 start->fpgsz = count; 627 inserted = uvm_pmr_insert(pmr, start, 0); 628 629 /* 630 * If the caller was working on a range and this function modified 631 * that range, update the pointer. 632 */ 633 if (work != NULL && *work != NULL && 634 atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) && 635 atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz > 636 atop(VM_PAGE_TO_PHYS(*work))) 637 *work = inserted; 638 return count; 639 } 640 641 /* 642 * Extract a number of pages from a segment of free pages. 643 * Called by uvm_pmr_getpages. 644 * 645 * Returns the segment that was created from pages left over at the tail 646 * of the remove set of pages, or NULL if no pages were left at the tail. 
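 * start and end are page numbers and must lie within the free range described by pg.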
647 */ 648 struct vm_page * 649 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg, 650 paddr_t start, paddr_t end, struct pglist *result) 651 { 652 struct vm_page *after, *pg_i; 653 psize_t before_sz, after_sz; 654 #ifdef DEBUG 655 psize_t i; 656 #endif 657 658 KDASSERT(end > start); 659 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg))); 660 KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz); 661 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start); 662 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end); 663 664 before_sz = start - atop(VM_PAGE_TO_PHYS(pg)); 665 after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end; 666 KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz); 667 uvm_pmr_assertvalid(pmr); 668 669 uvm_pmr_remove_size(pmr, pg); 670 if (before_sz == 0) 671 uvm_pmr_remove_addr(pmr, pg); 672 after = pg + before_sz + (end - start); 673 674 /* Add selected pages to result. */ 675 for (pg_i = pg + before_sz; pg_i != after; pg_i++) { 676 KASSERT(pg_i->pg_flags & PQ_FREE); 677 pg_i->fpgsz = 0; 678 TAILQ_INSERT_TAIL(result, pg_i, pageq); 679 } 680 681 /* Before handling. */ 682 if (before_sz > 0) { 683 pg->fpgsz = before_sz; 684 uvm_pmr_insert_size(pmr, pg); 685 } 686 687 /* After handling. */ 688 if (after_sz > 0) { 689 #ifdef DEBUG 690 for (i = 0; i < after_sz; i++) { 691 KASSERT(!uvm_pmr_isfree(after + i)); 692 } 693 #endif 694 KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end); 695 after->fpgsz = after_sz; 696 after = uvm_pmr_insert_addr(pmr, after, 1); 697 uvm_pmr_insert_size(pmr, after); 698 } 699 700 uvm_pmr_assertvalid(pmr); 701 return (after_sz > 0 ? after : NULL); 702 } 703 704 /* 705 * Indicate to the page daemon that a nowait call failed and it should 706 * recover at least some memory in the most restricted region (assumed 707 * to be dma_constraint). 708 */ 709 extern volatile int uvm_nowait_failed; 710 711 /* 712 * Acquire a number of pages. 713 * 714 * count: the number of pages returned 715 * start: lowest page number 716 * end: highest page number +1 717 * (start = end = 0: no limitation) 718 * align: power-of-2 alignment constraint (align = 1: no alignment) 719 * boundary: power-of-2 boundary (boundary = 0: no boundary) 720 * maxseg: maximum number of segments to return 721 * flags: UVM_PLA_* flags 722 * result: returned pages storage (uses pageq) 723 */ 724 int 725 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align, 726 paddr_t boundary, int maxseg, int flags, struct pglist *result) 727 { 728 struct uvm_pmemrange *pmr; /* Iterate memory ranges. */ 729 struct vm_page *found, *f_next; /* Iterate chunks. */ 730 psize_t fcount; /* Current found pages. */ 731 int fnsegs; /* Current segment counter. */ 732 int try, start_try; 733 psize_t search[3]; 734 paddr_t fstart, fend; /* Pages to be taken from found. */ 735 int memtype; /* Requested memtype. */ 736 int memtype_init; /* Best memtype. */ 737 int desperate; /* True if allocation failed. */ 738 #ifdef DIAGNOSTIC 739 struct vm_page *diag_prev; /* Used during validation. */ 740 #endif /* DIAGNOSTIC */ 741 742 /* 743 * Validate arguments. 744 */ 745 KASSERT(count > 0); 746 KASSERT(start == 0 || end == 0 || start < end); 747 KASSERT(align >= 1); 748 KASSERT(powerof2(align)); 749 KASSERT(maxseg > 0); 750 KASSERT(boundary == 0 || powerof2(boundary)); 751 KASSERT(boundary == 0 || maxseg * boundary >= count); 752 KASSERT(TAILQ_EMPTY(result)); 753 754 /* 755 * TRYCONTIG is a noop if you only want a single segment. 
756 * Remove it if that's the case: otherwise it'll deny the fast 757 * allocation. 758 */ 759 if (maxseg == 1 || count == 1) 760 flags &= ~UVM_PLA_TRYCONTIG; 761 762 /* 763 * Configure search. 764 * 765 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case. 766 * search[1] is multiple segments, chosen to fulfill the search in 767 * approximately even-sized segments. 768 * This is a good trade-off between slightly reduced allocation speed 769 * and less fragmentation. 770 * search[2] is the worst case, in which all segments are evaluated. 771 * This provides the least fragmentation, but makes the search 772 * possibly longer (although in the case it is selected, that no 773 * longer matters most). 774 * 775 * The exception is when maxseg == 1: since we can only fulfill that 776 * with one segment of size pages, only a single search type has to 777 * be attempted. 778 */ 779 if (maxseg == 1 || count == 1) { 780 start_try = 2; 781 search[2] = count; 782 } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) { 783 start_try = 2; 784 search[2] = 1; 785 } else { 786 start_try = 0; 787 search[0] = count; 788 search[1] = pow2divide(count, maxseg); 789 search[2] = 1; 790 if ((flags & UVM_PLA_TRYCONTIG) == 0) 791 start_try = 1; 792 if (search[1] >= search[0]) { 793 search[1] = search[0]; 794 start_try = 1; 795 } 796 if (search[2] >= search[start_try]) { 797 start_try = 2; 798 } 799 } 800 801 /* 802 * Memory type: if zeroed memory is requested, traverse the zero set. 803 * Otherwise, traverse the dirty set. 804 * 805 * The memtype iterator is reinitialized to memtype_init on entrance 806 * of a pmemrange. 807 */ 808 if (flags & UVM_PLA_ZERO) 809 memtype_init = UVM_PMR_MEMTYPE_ZERO; 810 else 811 memtype_init = UVM_PMR_MEMTYPE_DIRTY; 812 813 /* 814 * Initially, we're not desperate. 815 * 816 * Note that if we return from a sleep, we are still desperate. 817 * Chances are that memory pressure is still high, so resetting 818 * seems over-optimistic to me. 819 */ 820 desperate = 0; 821 822 uvm_lock_fpageq(); 823 824 retry: /* Return point after sleeping. */ 825 fcount = 0; 826 fnsegs = 0; 827 828 retry_desperate: 829 /* 830 * If we just want any page(s), go for the really fast option. 831 */ 832 if (count <= maxseg && align == 1 && boundary == 0 && 833 (flags & UVM_PLA_TRYCONTIG) == 0) { 834 fcount += uvm_pmr_get1page(count - fcount, memtype_init, 835 result, start, end, 0); 836 837 /* 838 * If we found sufficient pages, go to the succes exit code. 839 * 840 * Otherwise, go immediately to fail, since we collected 841 * all we could anyway. 842 */ 843 if (fcount == count) 844 goto out; 845 else 846 goto fail; 847 } 848 849 /* 850 * The heart of the contig case. 851 * 852 * The code actually looks like this: 853 * 854 * foreach (struct pmemrange) { 855 * foreach (memtype) { 856 * foreach(try) { 857 * foreach (free range of memtype in pmemrange, 858 * starting at search[try]) { 859 * while (range has space left) 860 * take from range 861 * } 862 * } 863 * } 864 * 865 * if next pmemrange has higher usecount than current: 866 * enter desperate case (which will drain the pmemranges 867 * until empty prior to moving to the next one) 868 * } 869 * 870 * When desperate is activated, try always starts at the highest 871 * value. The memtype loop is using a goto ReScanMemtype. 872 * The try loop is using a goto ReScan. 873 * The 'range has space left' loop uses label DrainFound. 
874 * 875 * Writing them all as loops would take up a lot of screen space in 876 * the form of indentation and some parts are easier to express 877 * using the labels. 878 */ 879 880 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 881 /* Empty range. */ 882 if (pmr->nsegs == 0) 883 continue; 884 885 /* Outside requested range. */ 886 if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 887 continue; 888 889 memtype = memtype_init; 890 891 rescan_memtype: /* Return point at memtype++. */ 892 try = start_try; 893 894 rescan: /* Return point at try++. */ 895 for (found = uvm_pmr_nfindsz(pmr, search[try], memtype); 896 found != NULL; 897 found = f_next) { 898 f_next = uvm_pmr_nextsz(pmr, found, memtype); 899 900 fstart = atop(VM_PAGE_TO_PHYS(found)); 901 if (start != 0) 902 fstart = MAX(start, fstart); 903 drain_found: 904 /* 905 * Throw away the first segment if fnsegs == maxseg 906 * 907 * Note that f_next is still valid after this call, 908 * since we only allocated from entries before f_next. 909 * We don't revisit the entries we already extracted 910 * from unless we entered the desperate case. 911 */ 912 if (fnsegs == maxseg) { 913 fnsegs--; 914 fcount -= 915 uvm_pmr_remove_1strange(result, boundary, 916 &found, desperate); 917 } 918 919 fstart = PMR_ALIGN(fstart, align); 920 fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz; 921 if (end != 0) 922 fend = MIN(end, fend); 923 if (boundary != 0) { 924 fend = 925 MIN(fend, PMR_ALIGN(fstart + 1, boundary)); 926 } 927 if (fstart >= fend) 928 continue; 929 if (fend - fstart > count - fcount) 930 fend = fstart + (count - fcount); 931 932 fcount += fend - fstart; 933 fnsegs++; 934 found = uvm_pmr_extract_range(pmr, found, 935 fstart, fend, result); 936 937 if (fcount == count) 938 goto out; 939 940 /* 941 * If there's still space left in found, try to 942 * fully drain it prior to continueing. 943 */ 944 if (found != NULL) { 945 fstart = fend; 946 goto drain_found; 947 } 948 } 949 950 /* Try a smaller search now. */ 951 if (++try < nitems(search)) 952 goto rescan; 953 954 /* 955 * Exhaust all memory types prior to going to the next memory 956 * segment. 957 * This means that zero-vs-dirty are eaten prior to moving 958 * to a pmemrange with a higher use-count. 959 * 960 * Code is basically a difficult way of writing: 961 * memtype = memtype_init; 962 * do { 963 * ...; 964 * memtype += 1; 965 * memtype %= MEMTYPE_MAX; 966 * } while (memtype != memtype_init); 967 */ 968 memtype += 1; 969 if (memtype == UVM_PMR_MEMTYPE_MAX) 970 memtype = 0; 971 if (memtype != memtype_init) 972 goto rescan_memtype; 973 974 /* 975 * If not desperate, enter desperate case prior to eating all 976 * the good stuff in the next range. 977 */ 978 if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL && 979 TAILQ_NEXT(pmr, pmr_use)->use != pmr->use) 980 break; 981 } 982 983 /* 984 * Not enough memory of the requested type available. Fall back to 985 * less good memory that we'll clean up better later. 986 * 987 * This algorithm is not very smart though, it just starts scanning 988 * a different typed range, but the nicer ranges of the previous 989 * iteration may fall out. Hence there is a small chance of a false 990 * negative. 991 * 992 * When desparate: scan all sizes starting at the smallest 993 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may 994 * allow us to hit the fast path now). 
995 * 996 * Also, because we will revisit entries we scanned before, we need 997 * to reset the page queue, or we may end up releasing entries in 998 * such a way as to invalidate f_next. 999 */ 1000 if (!desperate) { 1001 desperate = 1; 1002 start_try = nitems(search) - 1; 1003 flags &= ~UVM_PLA_TRYCONTIG; 1004 1005 while (!TAILQ_EMPTY(result)) 1006 uvm_pmr_remove_1strange(result, 0, NULL, 0); 1007 fnsegs = 0; 1008 fcount = 0; 1009 goto retry_desperate; 1010 } 1011 1012 fail: 1013 /* Allocation failed. */ 1014 /* XXX: claim from memory reserve here */ 1015 1016 while (!TAILQ_EMPTY(result)) 1017 uvm_pmr_remove_1strange(result, 0, NULL, 0); 1018 1019 if (flags & UVM_PLA_WAITOK) { 1020 if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count), 1021 flags & UVM_PLA_FAILOK) == 0) 1022 goto retry; 1023 KASSERT(flags & UVM_PLA_FAILOK); 1024 } else { 1025 if (!(flags & UVM_PLA_NOWAKE)) { 1026 uvm_nowait_failed = 1; 1027 wakeup(&uvm.pagedaemon); 1028 } 1029 } 1030 uvm_unlock_fpageq(); 1031 1032 return ENOMEM; 1033 1034 out: 1035 /* Allocation succesful. */ 1036 uvmexp.free -= fcount; 1037 1038 uvm_unlock_fpageq(); 1039 1040 /* Update statistics and zero pages if UVM_PLA_ZERO. */ 1041 #ifdef DIAGNOSTIC 1042 fnsegs = 0; 1043 fcount = 0; 1044 diag_prev = NULL; 1045 #endif /* DIAGNOSTIC */ 1046 TAILQ_FOREACH(found, result, pageq) { 1047 atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK); 1048 1049 if (found->pg_flags & PG_ZERO) { 1050 uvm_lock_fpageq(); 1051 uvmexp.zeropages--; 1052 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1053 wakeup(&uvmexp.zeropages); 1054 uvm_unlock_fpageq(); 1055 } 1056 if (flags & UVM_PLA_ZERO) { 1057 if (found->pg_flags & PG_ZERO) 1058 uvmexp.pga_zerohit++; 1059 else { 1060 uvmexp.pga_zeromiss++; 1061 uvm_pagezero(found); 1062 } 1063 } 1064 atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE); 1065 1066 found->uobject = NULL; 1067 found->uanon = NULL; 1068 found->pg_version++; 1069 1070 /* 1071 * Validate that the page matches range criterium. 1072 */ 1073 KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start); 1074 KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end); 1075 1076 #ifdef DIAGNOSTIC 1077 /* 1078 * Update fcount (# found pages) and 1079 * fnsegs (# found segments) counters. 1080 */ 1081 if (diag_prev == NULL || 1082 /* new segment if it contains a hole */ 1083 atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 != 1084 atop(VM_PAGE_TO_PHYS(found)) || 1085 /* new segment if it crosses boundary */ 1086 (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) != 1087 (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1))) 1088 fnsegs++; 1089 fcount++; 1090 1091 diag_prev = found; 1092 #endif /* DIAGNOSTIC */ 1093 } 1094 1095 #ifdef DIAGNOSTIC 1096 /* 1097 * Panic on algorithm failure. 1098 */ 1099 if (fcount != count || fnsegs > maxseg) { 1100 panic("pmemrange allocation error: " 1101 "allocated %ld pages in %d segments, " 1102 "but request was %ld pages in %d segments", 1103 fcount, fnsegs, count, maxseg); 1104 } 1105 #endif /* DIAGNOSTIC */ 1106 1107 return 0; 1108 } 1109 1110 /* 1111 * Free a number of contig pages (invoked by uvm_page_init). 
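 * The pages must be physically contiguous and must not already be free.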
1112 */ 1113 void 1114 uvm_pmr_freepages(struct vm_page *pg, psize_t count) 1115 { 1116 struct uvm_pmemrange *pmr; 1117 psize_t i, pmr_count; 1118 struct vm_page *firstpg = pg; 1119 1120 for (i = 0; i < count; i++) { 1121 KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) == 1122 atop(VM_PAGE_TO_PHYS(pg)) + i); 1123 1124 if (!((pg[i].pg_flags & PQ_FREE) == 0 && 1125 VALID_FLAGS(pg[i].pg_flags))) { 1126 printf("Flags: 0x%x, will panic now.\n", 1127 pg[i].pg_flags); 1128 } 1129 KASSERT((pg[i].pg_flags & PQ_FREE) == 0 && 1130 VALID_FLAGS(pg[i].pg_flags)); 1131 atomic_setbits_int(&pg[i].pg_flags, PQ_FREE); 1132 atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO); 1133 } 1134 1135 uvm_lock_fpageq(); 1136 1137 for (i = count; i > 0; i -= pmr_count) { 1138 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1139 KASSERT(pmr != NULL); 1140 1141 pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg))); 1142 pg->fpgsz = pmr_count; 1143 uvm_pmr_insert(pmr, pg, 0); 1144 1145 uvmexp.free += pmr_count; 1146 pg += pmr_count; 1147 } 1148 wakeup(&uvmexp.free); 1149 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1150 wakeup(&uvmexp.zeropages); 1151 1152 uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count)); 1153 1154 uvm_unlock_fpageq(); 1155 } 1156 1157 /* 1158 * Free all pages in the queue. 1159 */ 1160 void 1161 uvm_pmr_freepageq(struct pglist *pgl) 1162 { 1163 struct vm_page *pg; 1164 paddr_t pstart; 1165 psize_t plen; 1166 1167 TAILQ_FOREACH(pg, pgl, pageq) { 1168 if (!((pg->pg_flags & PQ_FREE) == 0 && 1169 VALID_FLAGS(pg->pg_flags))) { 1170 printf("Flags: 0x%x, will panic now.\n", 1171 pg->pg_flags); 1172 } 1173 KASSERT((pg->pg_flags & PQ_FREE) == 0 && 1174 VALID_FLAGS(pg->pg_flags)); 1175 atomic_setbits_int(&pg->pg_flags, PQ_FREE); 1176 atomic_clearbits_int(&pg->pg_flags, PG_ZERO); 1177 } 1178 1179 uvm_lock_fpageq(); 1180 while (!TAILQ_EMPTY(pgl)) { 1181 pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl)); 1182 plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0); 1183 uvmexp.free += plen; 1184 1185 uvm_wakeup_pla(pstart, ptoa(plen)); 1186 } 1187 wakeup(&uvmexp.free); 1188 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1189 wakeup(&uvmexp.zeropages); 1190 uvm_unlock_fpageq(); 1191 1192 return; 1193 } 1194 1195 /* 1196 * Store a pmemrange in the list. 1197 * 1198 * The list is sorted by use. 1199 */ 1200 struct uvm_pmemrange * 1201 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq, 1202 struct uvm_pmemrange *pmr) 1203 { 1204 struct uvm_pmemrange *iter; 1205 int cmp = 1; 1206 1207 TAILQ_FOREACH(iter, useq, pmr_use) { 1208 cmp = uvm_pmemrange_use_cmp(pmr, iter); 1209 if (cmp == 0) 1210 return iter; 1211 if (cmp == -1) 1212 break; 1213 } 1214 1215 if (iter == NULL) 1216 TAILQ_INSERT_TAIL(useq, pmr, pmr_use); 1217 else 1218 TAILQ_INSERT_BEFORE(iter, pmr, pmr_use); 1219 return NULL; 1220 } 1221 1222 #ifdef DEBUG 1223 /* 1224 * Validation of the whole pmemrange. 1225 * Called with fpageq locked. 1226 */ 1227 void 1228 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr) 1229 { 1230 struct vm_page *prev, *next, *i, *xref; 1231 int lcv, mti; 1232 1233 /* Empty range */ 1234 if (pmr->nsegs == 0) 1235 return; 1236 1237 /* Validate address tree. */ 1238 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) { 1239 /* Validate the range. */ 1240 KASSERT(i->fpgsz > 0); 1241 KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low); 1242 KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz 1243 <= pmr->high); 1244 1245 /* Validate each page in this range. */ 1246 for (lcv = 0; lcv < i->fpgsz; lcv++) { 1247 /* 1248 * Only the first page has a size specification. 1249 * Rest is size 0. 
1250 */ 1251 KASSERT(lcv == 0 || i[lcv].fpgsz == 0); 1252 /* 1253 * Flag check. 1254 */ 1255 KASSERT(VALID_FLAGS(i[lcv].pg_flags) && 1256 (i[lcv].pg_flags & PQ_FREE) == PQ_FREE); 1257 /* 1258 * Free pages are: 1259 * - not wired 1260 * - have no vm_anon 1261 * - have no uvm_object 1262 */ 1263 KASSERT(i[lcv].wire_count == 0); 1264 KASSERT(i[lcv].uanon == (void*)0xdeadbeef || 1265 i[lcv].uanon == NULL); 1266 KASSERT(i[lcv].uobject == (void*)0xdeadbeef || 1267 i[lcv].uobject == NULL); 1268 /* 1269 * Pages in a single range always have the same 1270 * memtype. 1271 */ 1272 KASSERT(uvm_pmr_pg_to_memtype(&i[0]) == 1273 uvm_pmr_pg_to_memtype(&i[lcv])); 1274 } 1275 1276 /* Check that it shouldn't be joined with its predecessor. */ 1277 prev = RBT_PREV(uvm_pmr_addr, i); 1278 if (prev != NULL) { 1279 KASSERT(uvm_pmr_pg_to_memtype(i) != 1280 uvm_pmr_pg_to_memtype(prev) || 1281 atop(VM_PAGE_TO_PHYS(i)) > 1282 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz || 1283 prev + prev->fpgsz != i); 1284 } 1285 1286 /* Assert i is in the size tree as well. */ 1287 if (i->fpgsz == 1) { 1288 TAILQ_FOREACH(xref, 1289 &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) { 1290 if (xref == i) 1291 break; 1292 } 1293 KASSERT(xref == i); 1294 } else { 1295 KASSERT(RBT_FIND(uvm_pmr_size, 1296 &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) == 1297 i + 1); 1298 } 1299 } 1300 1301 /* Validate size tree. */ 1302 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 1303 for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) { 1304 next = uvm_pmr_nextsz(pmr, i, mti); 1305 if (next != NULL) { 1306 KASSERT(i->fpgsz <= 1307 next->fpgsz); 1308 } 1309 1310 /* Assert i is in the addr tree as well. */ 1311 KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i); 1312 1313 /* Assert i is of the correct memory type. */ 1314 KASSERT(uvm_pmr_pg_to_memtype(i) == mti); 1315 } 1316 } 1317 1318 /* Validate nsegs statistic. */ 1319 lcv = 0; 1320 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) 1321 lcv++; 1322 KASSERT(pmr->nsegs == lcv); 1323 } 1324 #endif /* DEBUG */ 1325 1326 /* 1327 * Split pmr at split point pageno. 1328 * Called with fpageq unlocked. 1329 * 1330 * Split is only applied if a pmemrange spans pageno. 1331 */ 1332 void 1333 uvm_pmr_split(paddr_t pageno) 1334 { 1335 struct uvm_pmemrange *pmr, *drain; 1336 struct vm_page *rebuild, *prev, *next; 1337 psize_t prev_sz; 1338 1339 uvm_lock_fpageq(); 1340 pmr = uvm_pmemrange_find(pageno); 1341 if (pmr == NULL || !(pmr->low < pageno)) { 1342 /* No split required. */ 1343 uvm_unlock_fpageq(); 1344 return; 1345 } 1346 1347 KASSERT(pmr->low < pageno); 1348 KASSERT(pmr->high > pageno); 1349 1350 /* 1351 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into 1352 * uvm_kmemalloc which calls into pmemrange, making the locking 1353 * a bit hard, so we just race! 1354 */ 1355 uvm_unlock_fpageq(); 1356 drain = uvm_pmr_allocpmr(); 1357 uvm_lock_fpageq(); 1358 pmr = uvm_pmemrange_find(pageno); 1359 if (pmr == NULL || !(pmr->low < pageno)) { 1360 /* 1361 * We lost the race since someone else ran this or a related 1362 * function, however this should be triggered very rarely so 1363 * we just leak the pmr. 
1364 */ 1365 printf("uvm_pmr_split: lost one pmr\n"); 1366 uvm_unlock_fpageq(); 1367 return; 1368 } 1369 1370 drain->low = pageno; 1371 drain->high = pmr->high; 1372 drain->use = pmr->use; 1373 1374 uvm_pmr_assertvalid(pmr); 1375 uvm_pmr_assertvalid(drain); 1376 KASSERT(drain->nsegs == 0); 1377 1378 RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) { 1379 if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno) 1380 break; 1381 } 1382 if (rebuild == NULL) 1383 prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 1384 else 1385 prev = RBT_PREV(uvm_pmr_addr, rebuild); 1386 KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1387 1388 /* 1389 * Handle free chunk that spans the split point. 1390 */ 1391 if (prev != NULL && 1392 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) { 1393 psize_t before, after; 1394 1395 KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1396 1397 uvm_pmr_remove(pmr, prev); 1398 prev_sz = prev->fpgsz; 1399 before = pageno - atop(VM_PAGE_TO_PHYS(prev)); 1400 after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno; 1401 1402 KASSERT(before > 0); 1403 KASSERT(after > 0); 1404 1405 prev->fpgsz = before; 1406 uvm_pmr_insert(pmr, prev, 1); 1407 (prev + before)->fpgsz = after; 1408 uvm_pmr_insert(drain, prev + before, 1); 1409 } 1410 1411 /* Move free chunks that no longer fall in the range. */ 1412 for (; rebuild != NULL; rebuild = next) { 1413 next = RBT_NEXT(uvm_pmr_addr, rebuild); 1414 1415 uvm_pmr_remove(pmr, rebuild); 1416 uvm_pmr_insert(drain, rebuild, 1); 1417 } 1418 1419 pmr->high = pageno; 1420 uvm_pmr_assertvalid(pmr); 1421 uvm_pmr_assertvalid(drain); 1422 1423 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain); 1424 uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain); 1425 uvm_unlock_fpageq(); 1426 } 1427 1428 /* 1429 * Increase the usage counter for the given range of memory. 1430 * 1431 * The more usage counters a given range of memory has, the more will be 1432 * attempted not to allocate from it. 1433 * 1434 * Addresses here are in paddr_t, not page-numbers. 1435 * The lowest and highest allowed address are specified. 1436 */ 1437 void 1438 uvm_pmr_use_inc(paddr_t low, paddr_t high) 1439 { 1440 struct uvm_pmemrange *pmr; 1441 paddr_t sz; 1442 1443 /* pmr uses page numbers, translate low and high. */ 1444 high++; 1445 high = atop(trunc_page(high)); 1446 low = atop(round_page(low)); 1447 uvm_pmr_split(low); 1448 uvm_pmr_split(high); 1449 1450 sz = 0; 1451 uvm_lock_fpageq(); 1452 /* Increase use count on segments in range. */ 1453 RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) { 1454 if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) { 1455 TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use); 1456 pmr->use++; 1457 sz += pmr->high - pmr->low; 1458 uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr); 1459 } 1460 uvm_pmr_assertvalid(pmr); 1461 } 1462 uvm_unlock_fpageq(); 1463 1464 KASSERT(sz >= high - low); 1465 } 1466 1467 /* 1468 * Allocate a pmemrange. 1469 * 1470 * If called from uvm_page_init, the uvm_pageboot_alloc is used. 1471 * If called after uvm_init, malloc is used. 1472 * (And if called in between, you're dead.) 1473 */ 1474 struct uvm_pmemrange * 1475 uvm_pmr_allocpmr(void) 1476 { 1477 struct uvm_pmemrange *nw; 1478 int i; 1479 1480 /* We're only ever hitting the !uvm.page_init_done case for now. 
*/ 1481 if (!uvm.page_init_done) { 1482 nw = (struct uvm_pmemrange *) 1483 uvm_pageboot_alloc(sizeof(struct uvm_pmemrange)); 1484 } else { 1485 nw = malloc(sizeof(struct uvm_pmemrange), 1486 M_VMMAP, M_NOWAIT); 1487 } 1488 KASSERT(nw != NULL); 1489 memset(nw, 0, sizeof(struct uvm_pmemrange)); 1490 RBT_INIT(uvm_pmr_addr, &nw->addr); 1491 for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) { 1492 RBT_INIT(uvm_pmr_size, &nw->size[i]); 1493 TAILQ_INIT(&nw->single[i]); 1494 } 1495 return nw; 1496 } 1497 1498 /* 1499 * Initialization of pmr. 1500 * Called by uvm_page_init. 1501 * 1502 * Sets up pmemranges. 1503 */ 1504 void 1505 uvm_pmr_init(void) 1506 { 1507 struct uvm_pmemrange *new_pmr; 1508 int i; 1509 1510 TAILQ_INIT(&uvm.pmr_control.use); 1511 RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 1512 TAILQ_INIT(&uvm.pmr_control.allocs); 1513 1514 /* By default, one range for the entire address space. */ 1515 new_pmr = uvm_pmr_allocpmr(); 1516 new_pmr->low = 0; 1517 new_pmr->high = atop((paddr_t)-1) + 1; 1518 1519 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr); 1520 uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr); 1521 1522 for (i = 0; uvm_md_constraints[i] != NULL; i++) { 1523 uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low, 1524 uvm_md_constraints[i]->ucr_high); 1525 } 1526 } 1527 1528 /* 1529 * Find the pmemrange that contains the given page number. 1530 * 1531 * (Manually traverses the binary tree, because that is cheaper on stack 1532 * usage.) 1533 */ 1534 struct uvm_pmemrange * 1535 uvm_pmemrange_find(paddr_t pageno) 1536 { 1537 struct uvm_pmemrange *pmr; 1538 1539 pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 1540 while (pmr != NULL) { 1541 if (pmr->low > pageno) 1542 pmr = RBT_LEFT(uvm_pmemrange_addr, pmr); 1543 else if (pmr->high <= pageno) 1544 pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr); 1545 else 1546 break; 1547 } 1548 1549 return pmr; 1550 } 1551 1552 #if defined(DDB) || defined(DEBUG) 1553 /* 1554 * Return true if the given page is in any of the free lists. 1555 * Used by uvm_page_printit. 1556 * This function is safe, even if the page is not on the freeq. 1557 * Note: does not apply locking, only called from ddb. 1558 */ 1559 int 1560 uvm_pmr_isfree(struct vm_page *pg) 1561 { 1562 struct vm_page *r; 1563 struct uvm_pmemrange *pmr; 1564 1565 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1566 if (pmr == NULL) 1567 return 0; 1568 r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 1569 if (r == NULL) 1570 r = RBT_MAX(uvm_pmr_addr, &pmr->addr); 1571 else if (r != pg) 1572 r = RBT_PREV(uvm_pmr_addr, r); 1573 if (r == NULL) 1574 return 0; /* Empty tree. */ 1575 1576 KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg))); 1577 return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz > 1578 atop(VM_PAGE_TO_PHYS(pg)); 1579 } 1580 #endif /* DEBUG */ 1581 1582 /* 1583 * Given a root of a tree, find a range which intersects start, end and 1584 * is of the same memtype. 1585 * 1586 * Page must be in the address tree. 1587 */ 1588 struct vm_page* 1589 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root, 1590 paddr_t start, paddr_t end, int memtype) 1591 { 1592 int direction; 1593 struct vm_page *root; 1594 struct vm_page *high, *high_next; 1595 struct vm_page *low, *low_next; 1596 1597 KDASSERT(pmr != NULL && init_root != NULL); 1598 root = init_root; 1599 1600 /* Which direction to use for searching. 
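	 * A root that lies entirely below the requested range means we must
	 * search to the right (higher addresses); one that lies entirely
	 * above means we search to the left.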
*/ 1601 if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start) 1602 direction = 1; 1603 else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end) 1604 direction = -1; 1605 else /* nothing to do */ 1606 return root; 1607 1608 /* First, update root to fall within the chosen range. */ 1609 while (root && !PMR_INTERSECTS_WITH( 1610 atop(VM_PAGE_TO_PHYS(root)), 1611 atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz, 1612 start, end)) { 1613 if (direction == 1) 1614 root = RBT_RIGHT(uvm_objtree, root); 1615 else 1616 root = RBT_LEFT(uvm_objtree, root); 1617 } 1618 if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype) 1619 return root; 1620 1621 /* 1622 * Root is valid, but of the wrong memtype. 1623 * 1624 * Try to find a range that has the given memtype in the subtree 1625 * (memtype mismatches are costly, either because the conversion 1626 * is expensive, or a later allocation will need to do the opposite 1627 * conversion, which will be expensive). 1628 * 1629 * 1630 * First, simply increase address until we hit something we can use. 1631 * Cache the upper page, so we can page-walk later. 1632 */ 1633 high = root; 1634 high_next = RBT_RIGHT(uvm_objtree, high); 1635 while (high_next != NULL && PMR_INTERSECTS_WITH( 1636 atop(VM_PAGE_TO_PHYS(high_next)), 1637 atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz, 1638 start, end)) { 1639 high = high_next; 1640 if (uvm_pmr_pg_to_memtype(high) == memtype) 1641 return high; 1642 high_next = RBT_RIGHT(uvm_objtree, high); 1643 } 1644 1645 /* 1646 * Second, decrease the address until we hit something we can use. 1647 * Cache the lower page, so we can page-walk later. 1648 */ 1649 low = root; 1650 low_next = RBT_LEFT(uvm_objtree, low); 1651 while (low_next != NULL && PMR_INTERSECTS_WITH( 1652 atop(VM_PAGE_TO_PHYS(low_next)), 1653 atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz, 1654 start, end)) { 1655 low = low_next; 1656 if (uvm_pmr_pg_to_memtype(low) == memtype) 1657 return low; 1658 low_next = RBT_LEFT(uvm_objtree, low); 1659 } 1660 1661 if (low == high) 1662 return NULL; 1663 1664 /* No hits. Walk the address tree until we find something usable. */ 1665 for (low = RBT_NEXT(uvm_pmr_addr, low); 1666 low != high; 1667 low = RBT_NEXT(uvm_pmr_addr, low)) { 1668 KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)), 1669 atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz, 1670 start, end)); 1671 if (uvm_pmr_pg_to_memtype(low) == memtype) 1672 return low; 1673 } 1674 1675 /* Nothing found. */ 1676 return NULL; 1677 } 1678 1679 /* 1680 * Allocate any page, the fastest way. Page number constraints only. 1681 */ 1682 psize_t 1683 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result, 1684 paddr_t start, paddr_t end, int memtype_only) 1685 { 1686 struct uvm_pmemrange *pmr; 1687 struct vm_page *found, *splitpg; 1688 psize_t fcount; 1689 int memtype; 1690 1691 fcount = 0; 1692 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1693 /* We're done. */ 1694 if (fcount == count) 1695 break; 1696 1697 /* Outside requested range. */ 1698 if (!(start == 0 && end == 0) && 1699 !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 1700 continue; 1701 1702 /* Range is empty. */ 1703 if (pmr->nsegs == 0) 1704 continue; 1705 1706 /* Loop over all memtypes, starting at memtype_init. */ 1707 memtype = memtype_init; 1708 while (fcount != count) { 1709 found = TAILQ_FIRST(&pmr->single[memtype]); 1710 /* 1711 * If found is outside the range, walk the list 1712 * until we find something that intersects with 1713 * boundaries. 
1714 */ 1715 while (found && !PMR_INTERSECTS_WITH( 1716 atop(VM_PAGE_TO_PHYS(found)), 1717 atop(VM_PAGE_TO_PHYS(found)) + 1, 1718 start, end)) 1719 found = TAILQ_NEXT(found, pageq); 1720 1721 if (found == NULL) { 1722 /* 1723 * Check if the size tree contains a range 1724 * that intersects with the boundaries. As the 1725 * allocation is for any page, try the smallest 1726 * range so that large ranges are preserved for 1727 * more constrained cases. Only one entry is 1728 * checked here, to avoid a brute-force search. 1729 * 1730 * Note that a size tree gives pg[1] instead of 1731 * pg[0]. 1732 */ 1733 found = RBT_MIN(uvm_pmr_size, 1734 &pmr->size[memtype]); 1735 if (found != NULL) { 1736 found--; 1737 if (!PMR_INTERSECTS_WITH( 1738 atop(VM_PAGE_TO_PHYS(found)), 1739 atop(VM_PAGE_TO_PHYS(found)) + 1740 found->fpgsz, start, end)) 1741 found = NULL; 1742 } 1743 } 1744 if (found == NULL) { 1745 /* 1746 * Try address-guided search to meet the page 1747 * number constraints. 1748 */ 1749 found = RBT_ROOT(uvm_pmr_addr, &pmr->addr); 1750 if (found != NULL) { 1751 found = uvm_pmr_rootupdate(pmr, found, 1752 start, end, memtype); 1753 } 1754 } 1755 if (found != NULL) { 1756 uvm_pmr_assertvalid(pmr); 1757 uvm_pmr_remove_size(pmr, found); 1758 1759 /* 1760 * If the page intersects the end, then it'll 1761 * need splitting. 1762 * 1763 * Note that we don't need to split if the page 1764 * intersects start: the drain function will 1765 * simply stop on hitting start. 1766 */ 1767 if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) + 1768 found->fpgsz > end) { 1769 psize_t splitsz = 1770 atop(VM_PAGE_TO_PHYS(found)) + 1771 found->fpgsz - end; 1772 1773 uvm_pmr_remove_addr(pmr, found); 1774 uvm_pmr_assertvalid(pmr); 1775 found->fpgsz -= splitsz; 1776 splitpg = found + found->fpgsz; 1777 splitpg->fpgsz = splitsz; 1778 uvm_pmr_insert(pmr, splitpg, 1); 1779 1780 /* 1781 * At this point, splitpg and found 1782 * actually should be joined. 1783 * But we explicitly disable that, 1784 * because we will start subtracting 1785 * from found. 1786 */ 1787 KASSERT(start == 0 || 1788 atop(VM_PAGE_TO_PHYS(found)) + 1789 found->fpgsz > start); 1790 uvm_pmr_insert_addr(pmr, found, 1); 1791 } 1792 1793 /* 1794 * Fetch pages from the end. 1795 * If the range is larger than the requested 1796 * number of pages, this saves us an addr-tree 1797 * update. 1798 * 1799 * Since we take from the end and insert at 1800 * the head, any ranges keep preserved. 1801 */ 1802 while (found->fpgsz > 0 && fcount < count && 1803 (start == 0 || 1804 atop(VM_PAGE_TO_PHYS(found)) + 1805 found->fpgsz > start)) { 1806 found->fpgsz--; 1807 fcount++; 1808 TAILQ_INSERT_HEAD(result, 1809 &found[found->fpgsz], pageq); 1810 } 1811 if (found->fpgsz > 0) { 1812 uvm_pmr_insert_size(pmr, found); 1813 KDASSERT(fcount == count); 1814 uvm_pmr_assertvalid(pmr); 1815 return fcount; 1816 } 1817 1818 /* 1819 * Delayed addr-tree removal. 1820 */ 1821 uvm_pmr_remove_addr(pmr, found); 1822 uvm_pmr_assertvalid(pmr); 1823 } else { 1824 if (memtype_only) 1825 break; 1826 /* 1827 * Skip to the next memtype. 1828 */ 1829 memtype += 1; 1830 if (memtype == UVM_PMR_MEMTYPE_MAX) 1831 memtype = 0; 1832 if (memtype == memtype_init) 1833 break; 1834 } 1835 } 1836 } 1837 1838 /* 1839 * Search finished. 1840 * 1841 * Ran out of ranges before enough pages were gathered, or we hit the 1842 * case where found->fpgsz == count - fcount, in which case the 1843 * above exit condition didn't trigger. 1844 * 1845 * On failure, caller will free the pages. 
1846 */ 1847 return fcount; 1848 } 1849 1850 #ifdef DDB 1851 /* 1852 * Print information about pmemrange. 1853 * Does not do locking (so either call it from DDB or acquire fpageq lock 1854 * before invoking. 1855 */ 1856 void 1857 uvm_pmr_print(void) 1858 { 1859 struct uvm_pmemrange *pmr; 1860 struct vm_page *pg; 1861 psize_t size[UVM_PMR_MEMTYPE_MAX]; 1862 psize_t free; 1863 int useq_len; 1864 int mt; 1865 1866 printf("Ranges, use queue:\n"); 1867 useq_len = 0; 1868 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1869 useq_len++; 1870 free = 0; 1871 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 1872 pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]); 1873 if (pg != NULL) 1874 pg--; 1875 else 1876 pg = TAILQ_FIRST(&pmr->single[mt]); 1877 size[mt] = (pg == NULL ? 0 : pg->fpgsz); 1878 1879 RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr) 1880 free += pg->fpgsz; 1881 } 1882 1883 printf("* [0x%lx-0x%lx] use=%d nsegs=%ld", 1884 (unsigned long)pmr->low, (unsigned long)pmr->high, 1885 pmr->use, (unsigned long)pmr->nsegs); 1886 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 1887 printf(" maxsegsz[%d]=0x%lx", mt, 1888 (unsigned long)size[mt]); 1889 } 1890 printf(" free=0x%lx\n", (unsigned long)free); 1891 } 1892 printf("#ranges = %d\n", useq_len); 1893 } 1894 #endif 1895 1896 /* 1897 * uvm_wait_pla: wait (sleep) for the page daemon to free some pages 1898 * in a specific physmem area. 1899 * 1900 * Returns ENOMEM if the pagedaemon failed to free any pages. 1901 * If not failok, failure will lead to panic. 1902 * 1903 * Must be called with fpageq locked. 1904 */ 1905 int 1906 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok) 1907 { 1908 struct uvm_pmalloc pma; 1909 const char *wmsg = "pmrwait"; 1910 1911 if (curproc == uvm.pagedaemon_proc) { 1912 /* 1913 * This is not that uncommon when the pagedaemon is trying 1914 * to flush out a large mmapped file. VOP_WRITE will circle 1915 * back through the buffer cache and try to get more memory. 1916 * The pagedaemon starts by calling bufbackoff, but we can 1917 * easily use up that reserve in a single scan iteration. 1918 */ 1919 uvm_unlock_fpageq(); 1920 if (bufbackoff(NULL, atop(size)) == 0) { 1921 uvm_lock_fpageq(); 1922 return 0; 1923 } 1924 uvm_lock_fpageq(); 1925 1926 /* 1927 * XXX detect pagedaemon deadlock - see comment in 1928 * uvm_wait(), as this is exactly the same issue. 1929 */ 1930 printf("pagedaemon: wait_pla deadlock detected!\n"); 1931 msleep(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, hz >> 3); 1932 #if defined(DEBUG) 1933 /* DEBUG: panic so we can debug it */ 1934 panic("wait_pla pagedaemon deadlock"); 1935 #endif 1936 return 0; 1937 } 1938 1939 for (;;) { 1940 pma.pm_constraint.ucr_low = low; 1941 pma.pm_constraint.ucr_high = high; 1942 pma.pm_size = size; 1943 pma.pm_flags = UVM_PMA_LINKED; 1944 TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq); 1945 1946 wakeup(&uvm.pagedaemon); /* wake the daemon! */ 1947 while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY)) 1948 msleep(&pma, &uvm.fpageqlock, PVM, wmsg, 0); 1949 1950 if (!(pma.pm_flags & UVM_PMA_FREED) && 1951 pma.pm_flags & UVM_PMA_FAIL) { 1952 if (failok) 1953 return ENOMEM; 1954 printf("uvm_wait: failed to free %ld pages between " 1955 "0x%lx-0x%lx\n", atop(size), low, high); 1956 } else 1957 return 0; 1958 } 1959 /* UNREACHABLE */ 1960 } 1961 1962 /* 1963 * Wake up uvm_pmalloc sleepers. 
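 * [low, low + len) is the physical address range that has just been freed.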
1964 */ 1965 void 1966 uvm_wakeup_pla(paddr_t low, psize_t len) 1967 { 1968 struct uvm_pmalloc *pma, *pma_next; 1969 paddr_t high; 1970 1971 high = low + len; 1972 1973 /* Wake specific allocations waiting for this memory. */ 1974 for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL; 1975 pma = pma_next) { 1976 pma_next = TAILQ_NEXT(pma, pmq); 1977 1978 if (low < pma->pm_constraint.ucr_high && 1979 high > pma->pm_constraint.ucr_low) { 1980 pma->pm_flags |= UVM_PMA_FREED; 1981 if (!(pma->pm_flags & UVM_PMA_BUSY)) { 1982 pma->pm_flags &= ~UVM_PMA_LINKED; 1983 TAILQ_REMOVE(&uvm.pmr_control.allocs, pma, 1984 pmq); 1985 wakeup(pma); 1986 } 1987 } 1988 } 1989 } 1990 1991 void 1992 uvm_pagezero_thread(void *arg) 1993 { 1994 struct pglist pgl; 1995 struct vm_page *pg; 1996 int count; 1997 1998 /* Run at the lowest possible priority. */ 1999 curproc->p_p->ps_nice = NZERO + PRIO_MAX; 2000 2001 KERNEL_UNLOCK(); 2002 2003 TAILQ_INIT(&pgl); 2004 for (;;) { 2005 uvm_lock_fpageq(); 2006 while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET || 2007 (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY, 2008 &pgl, 0, 0, 1)) == 0) { 2009 msleep(&uvmexp.zeropages, &uvm.fpageqlock, MAXPRI, 2010 "pgzero", 0); 2011 } 2012 uvm_unlock_fpageq(); 2013 2014 TAILQ_FOREACH(pg, &pgl, pageq) { 2015 uvm_pagezero(pg); 2016 atomic_setbits_int(&pg->pg_flags, PG_ZERO); 2017 } 2018 2019 uvm_lock_fpageq(); 2020 while (!TAILQ_EMPTY(&pgl)) 2021 uvm_pmr_remove_1strange(&pgl, 0, NULL, 0); 2022 uvmexp.zeropages += count; 2023 uvm_unlock_fpageq(); 2024 2025 yield(); 2026 } 2027 } 2028
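/*
 * Illustrative sketch (not compiled): how a caller could request pages
 * through the interface above. The constraint values below are made-up
 * example numbers, not taken from this file; in-tree callers normally go
 * through uvm_pglistalloc(9) rather than calling uvm_pmr_getpages()
 * directly.
 *
 *	struct pglist pgl;
 *	int error;
 *
 *	TAILQ_INIT(&pgl);
 *	// 16 pages, physical addresses below 16MB, no alignment or boundary
 *	// constraint, at most one segment, zeroed, may sleep.
 *	error = uvm_pmr_getpages(16, 0, atop(16 * 1024 * 1024), 1, 0, 1,
 *	    UVM_PLA_WAITOK | UVM_PLA_ZERO, &pgl);
 *	if (error == 0) {
 *		// pages are linked through pageq; give them back when done
 *		uvm_pmr_freepageq(&pgl);
 *	}
 */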