1 /* $OpenBSD: uvm_pmemrange.c,v 1.61 2021/03/12 14:15:49 jsg Exp $ */
2
3 /*
4  * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18
19 #include <sys/param.h>
20 #include <sys/systm.h>
21 #include <uvm/uvm.h>
22 #include <sys/malloc.h>
23 #include <sys/kernel.h>
24 #include <sys/proc.h>
25 #include <sys/mount.h>
26
27 /*
28  * 2 trees: addr tree and size tree.
29  *
30  * The allocator keeps chunks of free pages (called a range).
31  * Two pages are part of the same range if:
32  * - all pages in between are part of that range,
33  * - they are of the same memory type (zeroed or non-zeroed),
34  * - they are part of the same pmemrange.
35  * A pmemrange is a range of memory which is part of the same vm_physseg
36  * and has a use-count.
37  *
38  * addr tree is vm_page[0].objt
39  * size tree is vm_page[1].objt
40  *
41  * The size tree is not used for memory ranges of 1 page; instead,
42  * the single queue is vm_page[0].pageq
43  *
44  * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
45  * are joined, unless:
46  * - they have pages in between them which are not free
47  * - they belong to different memtypes (zeroed vs dirty memory)
48  * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
49  * - they are not a continuation of the same array
50  * The latter issue is caused by vm_physseg ordering and splitting from the
51  * MD initialization machinery. The MD code is dependent on freelists and
52  * happens to split ISA memory from non-ISA memory.
53  * (Note: freelists die die die!)
54  *
55  * uvm_page_init guarantees that every vm_physseg contains an array of
56  * struct vm_page. Also, uvm_page_physload allocates an array of struct
57  * vm_page. This code depends on that array. The array may break across
58  * vm_physseg boundaries.
59  */
60
61 /*
62  * Validate the flags of the page. (Used in asserts.)
63  * Any free page must have the PQ_FREE flag set.
64  * Free pages may be zeroed.
65  * Pmap flags are left untouched.
66  *
67  * The PQ_FREE flag is not checked here: by not checking, we can easily use
68  * this check on pages which are being freed.
69  */
70 #define VALID_FLAGS(pg_flags) \
71 	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
72
73 /* Tree comparators. */
74 int uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
75 	    const struct uvm_pmemrange *);
76 int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
77 int uvm_pmr_pg_to_memtype(struct vm_page *);
78
79 #ifdef DDB
80 void uvm_pmr_print(void);
81 #endif
82
83 /*
84  * Memory types. The page flags are used to derive what the current memory
85  * type of a page is.
86  */
87 int
88 uvm_pmr_pg_to_memtype(struct vm_page *pg)
89 {
90 	if (pg->pg_flags & PG_ZERO)
91 		return UVM_PMR_MEMTYPE_ZERO;
92 	/* Default: dirty memory.
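	 * A page's memtype is derived purely from PG_ZERO: it only becomes
	 * UVM_PMR_MEMTYPE_ZERO after something (e.g. the pagezero thread at
	 * the bottom of this file) has zeroed it and set the flag.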
*/ 93 return UVM_PMR_MEMTYPE_DIRTY; 94 } 95 96 /* Trees. */ 97 RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp); 98 RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp); 99 RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr, 100 uvm_pmemrange_addr_cmp); 101 102 /* Validation. */ 103 #ifdef DEBUG 104 void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr); 105 #else 106 #define uvm_pmr_assertvalid(pmr) do {} while (0) 107 #endif 108 109 psize_t uvm_pmr_get1page(psize_t, int, struct pglist *, 110 paddr_t, paddr_t, int); 111 112 struct uvm_pmemrange *uvm_pmr_allocpmr(void); 113 struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int); 114 struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *, 115 struct vm_page *, int); 116 void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, 117 struct vm_page *pg, struct vm_page **pg_prev, 118 struct vm_page **pg_next); 119 struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *, 120 struct vm_page *, paddr_t); 121 struct vm_page *uvm_pmr_findprevsegment(struct uvm_pmemrange *, 122 struct vm_page *, paddr_t); 123 psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t, 124 struct vm_page **, int); 125 psize_t uvm_pmr_remove_1strange_reverse(struct pglist *, 126 paddr_t *); 127 void uvm_pmr_split(paddr_t); 128 struct uvm_pmemrange *uvm_pmemrange_find(paddr_t); 129 struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *, 130 struct uvm_pmemrange *); 131 psize_t pow2divide(psize_t, psize_t); 132 struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *, 133 struct vm_page *, paddr_t, paddr_t, int); 134 135 /* 136 * Computes num/denom and rounds it up to the next power-of-2. 137 * 138 * This is a division function which calculates an approximation of 139 * num/denom, with result =~ num/denom. It is meant to be fast and doesn't 140 * have to be accurate. 141 * 142 * Providing too large a value makes the allocator slightly faster, at the 143 * risk of hitting the failure case more often. Providing too small a value 144 * makes the allocator a bit slower, but less likely to hit a failure case. 145 */ 146 psize_t 147 pow2divide(psize_t num, psize_t denom) 148 { 149 int rshift; 150 151 for (rshift = 0; num > denom; rshift++, denom <<= 1) 152 ; 153 return (paddr_t)1 << rshift; 154 } 155 156 /* 157 * Predicate: lhs is a subrange or rhs. 158 * 159 * If rhs_low == 0: don't care about lower bound. 160 * If rhs_high == 0: don't care about upper bound. 161 */ 162 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \ 163 (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \ 164 ((rhs_high) == 0 || (lhs_high) <= (rhs_high))) 165 166 /* 167 * Predicate: lhs intersects with rhs. 168 * 169 * If rhs_low == 0: don't care about lower bound. 170 * If rhs_high == 0: don't care about upper bound. 171 * Ranges don't intersect if they don't have any page in common, array 172 * semantics mean that < instead of <= should be used here. 173 */ 174 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \ 175 (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \ 176 ((rhs_high) == 0 || (lhs_low) < (rhs_high))) 177 178 /* 179 * Align to power-of-2 alignment. 180 */ 181 #define PMR_ALIGN(pgno, align) \ 182 (((pgno) + ((align) - 1)) & ~((align) - 1)) 183 #define PMR_ALIGN_DOWN(pgno, align) \ 184 ((pgno) & ~((align) - 1)) 185 186 187 /* 188 * Comparator: sort by address ascending. 189 */ 190 int 191 uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs, 192 const struct uvm_pmemrange *rhs) 193 { 194 return lhs->low < rhs->low ? 
-1 : lhs->low > rhs->low; 195 } 196 197 /* 198 * Comparator: sort by use ascending. 199 * 200 * The higher the use value of a range, the more devices need memory in 201 * this range. Therefore allocate from the range with the lowest use first. 202 */ 203 int 204 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs) 205 { 206 int result; 207 208 result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use; 209 if (result == 0) 210 result = uvm_pmemrange_addr_cmp(lhs, rhs); 211 return result; 212 } 213 214 int 215 uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 216 { 217 paddr_t lhs_addr, rhs_addr; 218 219 lhs_addr = VM_PAGE_TO_PHYS(lhs); 220 rhs_addr = VM_PAGE_TO_PHYS(rhs); 221 222 return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr); 223 } 224 225 int 226 uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 227 { 228 psize_t lhs_size, rhs_size; 229 int cmp; 230 231 /* Using second tree, so we receive pg[1] instead of pg[0]. */ 232 lhs_size = (lhs - 1)->fpgsz; 233 rhs_size = (rhs - 1)->fpgsz; 234 235 cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size); 236 if (cmp == 0) 237 cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1); 238 return cmp; 239 } 240 241 /* 242 * Find the first range of free pages that is at least sz pages long. 243 */ 244 struct vm_page * 245 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti) 246 { 247 struct vm_page *node, *best; 248 249 KASSERT(sz >= 1); 250 251 if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti])) 252 return TAILQ_FIRST(&pmr->single[mti]); 253 254 node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]); 255 best = NULL; 256 while (node != NULL) { 257 if ((node - 1)->fpgsz >= sz) { 258 best = (node - 1); 259 node = RBT_LEFT(uvm_objtree, node); 260 } else 261 node = RBT_RIGHT(uvm_objtree, node); 262 } 263 return best; 264 } 265 266 /* 267 * Finds the next range. The next range has a size >= pg->fpgsz. 268 * Returns NULL if no more ranges are available. 269 */ 270 struct vm_page * 271 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt) 272 { 273 struct vm_page *npg; 274 275 KASSERT(pmr != NULL && pg != NULL); 276 if (pg->fpgsz == 1) { 277 if (TAILQ_NEXT(pg, pageq) != NULL) 278 return TAILQ_NEXT(pg, pageq); 279 else 280 npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]); 281 } else 282 npg = RBT_NEXT(uvm_pmr_size, pg + 1); 283 284 return npg == NULL ? NULL : npg - 1; 285 } 286 287 /* 288 * Finds the previous and next ranges relative to the (uninserted) pg range. 289 * 290 * *pg_prev == NULL if no previous range is available, that can join with 291 * pg. 292 * *pg_next == NULL if no next range is available, that can join with 293 * pg. 294 */ 295 void 296 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg, 297 struct vm_page **pg_prev, struct vm_page **pg_next) 298 { 299 KASSERT(pg_prev != NULL && pg_next != NULL); 300 301 *pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 302 if (*pg_next == NULL) 303 *pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 304 else 305 *pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next); 306 307 KDASSERT(*pg_next == NULL || 308 VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg)); 309 KDASSERT(*pg_prev == NULL || 310 VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg)); 311 312 /* Reset if not contig. */ 313 if (*pg_prev != NULL && 314 (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz 315 != atop(VM_PAGE_TO_PHYS(pg)) || 316 *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. 
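	     * i.e. pg and *pg_prev live in different vm_page
	     * arrays (vm_physseg boundaries), so they cannot be
	     * merged even if their page numbers are adjacent.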
*/ 317 uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg))) 318 *pg_prev = NULL; 319 if (*pg_next != NULL && 320 (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz 321 != atop(VM_PAGE_TO_PHYS(*pg_next)) || 322 pg + pg->fpgsz != *pg_next || /* Array broke. */ 323 uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg))) 324 *pg_next = NULL; 325 return; 326 } 327 328 /* 329 * Remove a range from the address tree. 330 * Address tree maintains pmr counters. 331 */ 332 void 333 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg) 334 { 335 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 336 KDASSERT(pg->pg_flags & PQ_FREE); 337 RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg); 338 339 pmr->nsegs--; 340 } 341 /* 342 * Remove a range from the size tree. 343 */ 344 void 345 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 346 { 347 int memtype; 348 #ifdef DEBUG 349 struct vm_page *i; 350 #endif 351 352 KDASSERT(pg->fpgsz >= 1); 353 KDASSERT(pg->pg_flags & PQ_FREE); 354 memtype = uvm_pmr_pg_to_memtype(pg); 355 356 if (pg->fpgsz == 1) { 357 #ifdef DEBUG 358 TAILQ_FOREACH(i, &pmr->single[memtype], pageq) { 359 if (i == pg) 360 break; 361 } 362 KDASSERT(i == pg); 363 #endif 364 TAILQ_REMOVE(&pmr->single[memtype], pg, pageq); 365 } else { 366 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype], 367 pg + 1) == pg + 1); 368 RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1); 369 } 370 } 371 /* Remove from both trees. */ 372 void 373 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg) 374 { 375 uvm_pmr_assertvalid(pmr); 376 uvm_pmr_remove_size(pmr, pg); 377 uvm_pmr_remove_addr(pmr, pg); 378 uvm_pmr_assertvalid(pmr); 379 } 380 381 /* 382 * Insert the range described in pg. 383 * Returns the range thus created (which may be joined with the previous and 384 * next ranges). 385 * If no_join, the caller guarantees that the range cannot possibly join 386 * with adjecent ranges. 387 */ 388 struct vm_page * 389 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 390 { 391 struct vm_page *prev, *next; 392 393 #ifdef DEBUG 394 struct vm_page *i; 395 int mt; 396 #endif 397 398 KDASSERT(pg->pg_flags & PQ_FREE); 399 KDASSERT(pg->fpgsz >= 1); 400 401 #ifdef DEBUG 402 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 403 TAILQ_FOREACH(i, &pmr->single[mt], pageq) 404 KDASSERT(i != pg); 405 if (pg->fpgsz > 1) { 406 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt], 407 pg + 1) == NULL); 408 } 409 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL); 410 } 411 #endif 412 413 if (!no_join) { 414 uvm_pmr_pnaddr(pmr, pg, &prev, &next); 415 if (next != NULL) { 416 uvm_pmr_remove_size(pmr, next); 417 uvm_pmr_remove_addr(pmr, next); 418 pg->fpgsz += next->fpgsz; 419 next->fpgsz = 0; 420 } 421 if (prev != NULL) { 422 uvm_pmr_remove_size(pmr, prev); 423 prev->fpgsz += pg->fpgsz; 424 pg->fpgsz = 0; 425 return prev; 426 } 427 } 428 429 RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg); 430 431 pmr->nsegs++; 432 433 return pg; 434 } 435 /* 436 * Insert the range described in pg. 437 * Returns the range thus created (which may be joined with the previous and 438 * next ranges). 439 * Page must already be in the address tree. 
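 * Note: a range of exactly one page is kept on the single[memtype] queue;
 * only larger ranges go into the size tree.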
440 */ 441 void 442 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 443 { 444 int memtype; 445 #ifdef DEBUG 446 struct vm_page *i; 447 int mti; 448 #endif 449 450 KDASSERT(pg->fpgsz >= 1); 451 KDASSERT(pg->pg_flags & PQ_FREE); 452 453 memtype = uvm_pmr_pg_to_memtype(pg); 454 #ifdef DEBUG 455 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 456 TAILQ_FOREACH(i, &pmr->single[mti], pageq) 457 KDASSERT(i != pg); 458 if (pg->fpgsz > 1) { 459 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti], 460 pg + 1) == NULL); 461 } 462 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 463 } 464 for (i = pg; i < pg + pg->fpgsz; i++) 465 KASSERT(uvm_pmr_pg_to_memtype(i) == memtype); 466 #endif 467 468 if (pg->fpgsz == 1) 469 TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq); 470 else 471 RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1); 472 } 473 /* Insert in both trees. */ 474 struct vm_page * 475 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 476 { 477 uvm_pmr_assertvalid(pmr); 478 pg = uvm_pmr_insert_addr(pmr, pg, no_join); 479 uvm_pmr_insert_size(pmr, pg); 480 uvm_pmr_assertvalid(pmr); 481 return pg; 482 } 483 484 /* 485 * Find the last page that is part of this segment. 486 * => pg: the range at which to start the search. 487 * => boundary: the page number boundary specification (0 = no boundary). 488 * => pmr: the pmemrange of the page. 489 * 490 * This function returns 1 before the next range, so if you want to have the 491 * next range, you need to run TAILQ_NEXT(result, pageq) after calling. 492 * The reason is that this way, the length of the segment is easily 493 * calculated using: atop(result) - atop(pg) + 1. 494 * Hence this function also never returns NULL. 495 */ 496 struct vm_page * 497 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr, 498 struct vm_page *pg, paddr_t boundary) 499 { 500 paddr_t first_boundary; 501 struct vm_page *next; 502 struct vm_page *prev; 503 504 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && 505 pmr->high > atop(VM_PAGE_TO_PHYS(pg))); 506 if (boundary != 0) { 507 first_boundary = 508 PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary); 509 } else 510 first_boundary = 0; 511 512 /* 513 * Increase next until it hits the first page of the next segment. 514 * 515 * While loop checks the following: 516 * - next != NULL we have not reached the end of pgl 517 * - boundary == 0 || next < first_boundary 518 * we do not cross a boundary 519 * - atop(prev) + 1 == atop(next) 520 * still in the same segment 521 * - low <= last 522 * - high > last still in the same memory range 523 * - memtype is equal allocator is unable to view different memtypes 524 * as part of the same segment 525 * - prev + 1 == next no array breakage occurs 526 */ 527 prev = pg; 528 next = TAILQ_NEXT(prev, pageq); 529 while (next != NULL && 530 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) && 531 atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) && 532 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && 533 pmr->high > atop(VM_PAGE_TO_PHYS(next)) && 534 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && 535 prev + 1 == next) { 536 prev = next; 537 next = TAILQ_NEXT(prev, pageq); 538 } 539 540 /* 541 * End of this segment. 542 */ 543 return prev; 544 } 545 546 /* 547 * Find the first page that is part of this segment. 548 * => pg: the range at which to start the search. 549 * => boundary: the page number boundary specification (0 = no boundary). 550 * => pmr: the pmemrange of the page. 
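 *
 * The pages on the list are expected to be in reverse order of physical
 * address (see uvm_pmr_remove_1strange_reverse).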
551 * 552 * This function returns 1 after the previous range, so if you want to have the 553 * previous range, you need to run TAILQ_NEXT(result, pageq) after calling. 554 * The reason is that this way, the length of the segment is easily 555 * calculated using: atop(pg) - atop(result) + 1. 556 * Hence this function also never returns NULL. 557 */ 558 struct vm_page * 559 uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr, 560 struct vm_page *pg, paddr_t boundary) 561 { 562 paddr_t first_boundary; 563 struct vm_page *next; 564 struct vm_page *prev; 565 566 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && 567 pmr->high > atop(VM_PAGE_TO_PHYS(pg))); 568 if (boundary != 0) { 569 first_boundary = 570 PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary); 571 } else 572 first_boundary = 0; 573 574 /* 575 * Increase next until it hits the first page of the previous segment. 576 * 577 * While loop checks the following: 578 * - next != NULL we have not reached the end of pgl 579 * - boundary == 0 || next >= first_boundary 580 * we do not cross a boundary 581 * - atop(prev) - 1 == atop(next) 582 * still in the same segment 583 * - low <= last 584 * - high > last still in the same memory range 585 * - memtype is equal allocator is unable to view different memtypes 586 * as part of the same segment 587 * - prev - 1 == next no array breakage occurs 588 */ 589 prev = pg; 590 next = TAILQ_NEXT(prev, pageq); 591 while (next != NULL && 592 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) && 593 atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) && 594 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && 595 pmr->high > atop(VM_PAGE_TO_PHYS(next)) && 596 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && 597 prev - 1 == next) { 598 prev = next; 599 next = TAILQ_NEXT(prev, pageq); 600 } 601 602 /* 603 * Start of this segment. 604 */ 605 return prev; 606 } 607 608 /* 609 * Remove the first segment of contiguous pages from pgl. 610 * A segment ends if it crosses boundary (unless boundary = 0) or 611 * if it would enter a different uvm_pmemrange. 612 * 613 * Work: the page range that the caller is currently working with. 614 * May be null. 615 * 616 * If is_desperate is non-zero, the smallest segment is erased. Otherwise, 617 * the first segment is erased (which, if called by uvm_pmr_getpages(), 618 * probably is the smallest or very close to it). 619 */ 620 psize_t 621 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary, 622 struct vm_page **work, int is_desperate) 623 { 624 struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest; 625 psize_t count; 626 struct uvm_pmemrange *pmr, *pmr_iter; 627 628 KASSERT(!TAILQ_EMPTY(pgl)); 629 630 /* 631 * Initialize to first page. 632 * Unless desperate scan finds a better candidate, this is what'll be 633 * erased. 634 */ 635 start = TAILQ_FIRST(pgl); 636 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); 637 end = uvm_pmr_findnextsegment(pmr, start, boundary); 638 639 /* 640 * If we are desperate, we _really_ want to get rid of the smallest 641 * element (rather than a close match to the smallest element). 642 */ 643 if (is_desperate) { 644 /* Linear search for smallest segment. */ 645 pmr_iter = pmr; 646 for (iter = TAILQ_NEXT(end, pageq); 647 iter != NULL && start != end; 648 iter = TAILQ_NEXT(iter_end, pageq)) { 649 /* 650 * Only update pmr if it doesn't match current 651 * iteration. 
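			 * (pmr_iter tracks the pmemrange of the segment
			 * being examined; pmr, start and end remember the
			 * smallest candidate found so far.)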
652 */ 653 if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) || 654 pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) { 655 pmr_iter = uvm_pmemrange_find(atop( 656 VM_PAGE_TO_PHYS(iter))); 657 } 658 659 iter_end = uvm_pmr_findnextsegment(pmr_iter, iter, 660 boundary); 661 662 /* 663 * Current iteration is smaller than best match so 664 * far; update. 665 */ 666 if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) < 667 VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) { 668 start = iter; 669 end = iter_end; 670 pmr = pmr_iter; 671 } 672 } 673 } 674 675 /* 676 * Calculate count and end of the list. 677 */ 678 count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1; 679 lowest = start; 680 end = TAILQ_NEXT(end, pageq); 681 682 /* 683 * Actually remove the range of pages. 684 * 685 * Sadly, this cannot be done using pointer iteration: 686 * vm_physseg is not guaranteed to be sorted on address, hence 687 * uvm_page_init() may not have initialized its array sorted by 688 * page number. 689 */ 690 for (iter = start; iter != end; iter = iter_end) { 691 iter_end = TAILQ_NEXT(iter, pageq); 692 TAILQ_REMOVE(pgl, iter, pageq); 693 } 694 695 lowest->fpgsz = count; 696 inserted = uvm_pmr_insert(pmr, lowest, 0); 697 698 /* 699 * If the caller was working on a range and this function modified 700 * that range, update the pointer. 701 */ 702 if (work != NULL && *work != NULL && 703 atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) && 704 atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz > 705 atop(VM_PAGE_TO_PHYS(*work))) 706 *work = inserted; 707 return count; 708 } 709 710 /* 711 * Remove the first segment of contiguous pages from a pgl 712 * with the list elements in reverse order of physaddr. 713 * 714 * A segment ends if it would enter a different uvm_pmemrange. 715 * 716 * Stores starting physical address of the segment in pstart. 717 */ 718 psize_t 719 uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart) 720 { 721 struct vm_page *start, *end, *iter, *iter_end, *lowest; 722 psize_t count; 723 struct uvm_pmemrange *pmr; 724 725 KASSERT(!TAILQ_EMPTY(pgl)); 726 727 start = TAILQ_FIRST(pgl); 728 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); 729 end = uvm_pmr_findprevsegment(pmr, start, 0); 730 731 KASSERT(end <= start); 732 733 /* 734 * Calculate count and end of the list. 735 */ 736 count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1; 737 lowest = end; 738 end = TAILQ_NEXT(end, pageq); 739 740 /* 741 * Actually remove the range of pages. 742 * 743 * Sadly, this cannot be done using pointer iteration: 744 * vm_physseg is not guaranteed to be sorted on address, hence 745 * uvm_page_init() may not have initialized its array sorted by 746 * page number. 747 */ 748 for (iter = start; iter != end; iter = iter_end) { 749 iter_end = TAILQ_NEXT(iter, pageq); 750 TAILQ_REMOVE(pgl, iter, pageq); 751 } 752 753 lowest->fpgsz = count; 754 (void) uvm_pmr_insert(pmr, lowest, 0); 755 756 *pstart = VM_PAGE_TO_PHYS(lowest); 757 return count; 758 } 759 760 /* 761 * Extract a number of pages from a segment of free pages. 762 * Called by uvm_pmr_getpages. 763 * 764 * Returns the segment that was created from pages left over at the tail 765 * of the remove set of pages, or NULL if no pages were left at the tail. 
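 *
 * Roughly, with everything taken as page numbers:
 *
 *     pg              start          end        pg + pg->fpgsz
 *      |--- before ----|--- result ---|-- after --|
 *
 * "before" stays behind as a shrunk free range, the pages in [start, end)
 * are appended to the result list and "after" is re-inserted as a free
 * range of its own.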
766 */ 767 struct vm_page * 768 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg, 769 paddr_t start, paddr_t end, struct pglist *result) 770 { 771 struct vm_page *after, *pg_i; 772 psize_t before_sz, after_sz; 773 #ifdef DEBUG 774 psize_t i; 775 #endif 776 777 KDASSERT(end > start); 778 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg))); 779 KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz); 780 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start); 781 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end); 782 783 before_sz = start - atop(VM_PAGE_TO_PHYS(pg)); 784 after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end; 785 KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz); 786 uvm_pmr_assertvalid(pmr); 787 788 uvm_pmr_remove_size(pmr, pg); 789 if (before_sz == 0) 790 uvm_pmr_remove_addr(pmr, pg); 791 after = pg + before_sz + (end - start); 792 793 /* Add selected pages to result. */ 794 for (pg_i = pg + before_sz; pg_i != after; pg_i++) { 795 KASSERT(pg_i->pg_flags & PQ_FREE); 796 pg_i->fpgsz = 0; 797 TAILQ_INSERT_TAIL(result, pg_i, pageq); 798 } 799 800 /* Before handling. */ 801 if (before_sz > 0) { 802 pg->fpgsz = before_sz; 803 uvm_pmr_insert_size(pmr, pg); 804 } 805 806 /* After handling. */ 807 if (after_sz > 0) { 808 #ifdef DEBUG 809 for (i = 0; i < after_sz; i++) { 810 KASSERT(!uvm_pmr_isfree(after + i)); 811 } 812 #endif 813 KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end); 814 after->fpgsz = after_sz; 815 after = uvm_pmr_insert_addr(pmr, after, 1); 816 uvm_pmr_insert_size(pmr, after); 817 } 818 819 uvm_pmr_assertvalid(pmr); 820 return (after_sz > 0 ? after : NULL); 821 } 822 823 /* 824 * Indicate to the page daemon that a nowait call failed and it should 825 * recover at least some memory in the most restricted region (assumed 826 * to be dma_constraint). 827 */ 828 extern volatile int uvm_nowait_failed; 829 830 /* 831 * Acquire a number of pages. 832 * 833 * count: the number of pages returned 834 * start: lowest page number 835 * end: highest page number +1 836 * (start = end = 0: no limitation) 837 * align: power-of-2 alignment constraint (align = 1: no alignment) 838 * boundary: power-of-2 boundary (boundary = 0: no boundary) 839 * maxseg: maximum number of segments to return 840 * flags: UVM_PLA_* flags 841 * result: returned pages storage (uses pageq) 842 */ 843 int 844 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align, 845 paddr_t boundary, int maxseg, int flags, struct pglist *result) 846 { 847 struct uvm_pmemrange *pmr; /* Iterate memory ranges. */ 848 struct vm_page *found, *f_next; /* Iterate chunks. */ 849 psize_t fcount; /* Current found pages. */ 850 int fnsegs; /* Current segment counter. */ 851 int try, start_try; 852 psize_t search[3]; 853 paddr_t fstart, fend; /* Pages to be taken from found. */ 854 int memtype; /* Requested memtype. */ 855 int memtype_init; /* Best memtype. */ 856 int desperate; /* True if allocation failed. */ 857 #ifdef DIAGNOSTIC 858 struct vm_page *diag_prev; /* Used during validation. */ 859 #endif /* DIAGNOSTIC */ 860 861 /* 862 * Validate arguments. 863 */ 864 KASSERT(count > 0); 865 KASSERT(start == 0 || end == 0 || start < end); 866 KASSERT(align >= 1); 867 KASSERT(powerof2(align)); 868 KASSERT(maxseg > 0); 869 KASSERT(boundary == 0 || powerof2(boundary)); 870 KASSERT(boundary == 0 || maxseg * boundary >= count); 871 KASSERT(TAILQ_EMPTY(result)); 872 KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT)); 873 874 /* 875 * TRYCONTIG is a noop if you only want a single segment. 
876 * Remove it if that's the case: otherwise it'll deny the fast 877 * allocation. 878 */ 879 if (maxseg == 1 || count == 1) 880 flags &= ~UVM_PLA_TRYCONTIG; 881 882 /* 883 * Configure search. 884 * 885 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case. 886 * search[1] is multiple segments, chosen to fulfill the search in 887 * approximately even-sized segments. 888 * This is a good trade-off between slightly reduced allocation speed 889 * and less fragmentation. 890 * search[2] is the worst case, in which all segments are evaluated. 891 * This provides the least fragmentation, but makes the search 892 * possibly longer (although in the case it is selected, that no 893 * longer matters most). 894 * 895 * The exception is when maxseg == 1: since we can only fulfill that 896 * with one segment of size pages, only a single search type has to 897 * be attempted. 898 */ 899 if (maxseg == 1 || count == 1) { 900 start_try = 2; 901 search[2] = count; 902 } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) { 903 start_try = 2; 904 search[2] = 1; 905 } else { 906 start_try = 0; 907 search[0] = count; 908 search[1] = pow2divide(count, maxseg); 909 search[2] = 1; 910 if ((flags & UVM_PLA_TRYCONTIG) == 0) 911 start_try = 1; 912 if (search[1] >= search[0]) { 913 search[1] = search[0]; 914 start_try = 1; 915 } 916 if (search[2] >= search[start_try]) { 917 start_try = 2; 918 } 919 } 920 921 /* 922 * Memory type: if zeroed memory is requested, traverse the zero set. 923 * Otherwise, traverse the dirty set. 924 * 925 * The memtype iterator is reinitialized to memtype_init on entrance 926 * of a pmemrange. 927 */ 928 if (flags & UVM_PLA_ZERO) 929 memtype_init = UVM_PMR_MEMTYPE_ZERO; 930 else 931 memtype_init = UVM_PMR_MEMTYPE_DIRTY; 932 933 /* 934 * Initially, we're not desperate. 935 * 936 * Note that if we return from a sleep, we are still desperate. 937 * Chances are that memory pressure is still high, so resetting 938 * seems over-optimistic to me. 939 */ 940 desperate = 0; 941 942 again: 943 uvm_lock_fpageq(); 944 945 /* 946 * check to see if we need to generate some free pages waking 947 * the pagedaemon. 948 */ 949 if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin || 950 ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg && 951 (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) 952 wakeup(&uvm.pagedaemon); 953 954 /* 955 * fail if any of these conditions is true: 956 * [1] there really are no free pages, or 957 * [2] only kernel "reserved" pages remain and 958 * the UVM_PLA_USERESERVE flag wasn't used. 959 * [3] only pagedaemon "reserved" pages remain and 960 * the requestor isn't the pagedaemon nor the syncer. 961 */ 962 if ((uvmexp.free <= uvmexp.reserve_kernel) && 963 !(flags & UVM_PLA_USERESERVE)) { 964 uvm_unlock_fpageq(); 965 return ENOMEM; 966 } 967 968 if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) && 969 (curproc != uvm.pagedaemon_proc) && (curproc != syncerproc)) { 970 uvm_unlock_fpageq(); 971 if (flags & UVM_PLA_WAITOK) { 972 uvm_wait("uvm_pmr_getpages"); 973 goto again; 974 } 975 return ENOMEM; 976 } 977 978 retry: /* Return point after sleeping. */ 979 fcount = 0; 980 fnsegs = 0; 981 982 retry_desperate: 983 /* 984 * If we just want any page(s), go for the really fast option. 985 */ 986 if (count <= maxseg && align == 1 && boundary == 0 && 987 (flags & UVM_PLA_TRYCONTIG) == 0) { 988 fcount += uvm_pmr_get1page(count - fcount, memtype_init, 989 result, start, end, 0); 990 991 /* 992 * If we found sufficient pages, go to the success exit code. 
993 * 994 * Otherwise, go immediately to fail, since we collected 995 * all we could anyway. 996 */ 997 if (fcount == count) 998 goto out; 999 else 1000 goto fail; 1001 } 1002 1003 /* 1004 * The heart of the contig case. 1005 * 1006 * The code actually looks like this: 1007 * 1008 * foreach (struct pmemrange) { 1009 * foreach (memtype) { 1010 * foreach(try) { 1011 * foreach (free range of memtype in pmemrange, 1012 * starting at search[try]) { 1013 * while (range has space left) 1014 * take from range 1015 * } 1016 * } 1017 * } 1018 * 1019 * if next pmemrange has higher usecount than current: 1020 * enter desperate case (which will drain the pmemranges 1021 * until empty prior to moving to the next one) 1022 * } 1023 * 1024 * When desperate is activated, try always starts at the highest 1025 * value. The memtype loop is using a goto ReScanMemtype. 1026 * The try loop is using a goto ReScan. 1027 * The 'range has space left' loop uses label DrainFound. 1028 * 1029 * Writing them all as loops would take up a lot of screen space in 1030 * the form of indentation and some parts are easier to express 1031 * using the labels. 1032 */ 1033 1034 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1035 /* Empty range. */ 1036 if (pmr->nsegs == 0) 1037 continue; 1038 1039 /* Outside requested range. */ 1040 if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 1041 continue; 1042 1043 memtype = memtype_init; 1044 1045 rescan_memtype: /* Return point at memtype++. */ 1046 try = start_try; 1047 1048 rescan: /* Return point at try++. */ 1049 for (found = uvm_pmr_nfindsz(pmr, search[try], memtype); 1050 found != NULL; 1051 found = f_next) { 1052 f_next = uvm_pmr_nextsz(pmr, found, memtype); 1053 1054 fstart = atop(VM_PAGE_TO_PHYS(found)); 1055 if (start != 0) 1056 fstart = MAX(start, fstart); 1057 drain_found: 1058 /* 1059 * Throw away the first segment if fnsegs == maxseg 1060 * 1061 * Note that f_next is still valid after this call, 1062 * since we only allocated from entries before f_next. 1063 * We don't revisit the entries we already extracted 1064 * from unless we entered the desperate case. 1065 */ 1066 if (fnsegs == maxseg) { 1067 fnsegs--; 1068 fcount -= 1069 uvm_pmr_remove_1strange(result, boundary, 1070 &found, desperate); 1071 } 1072 1073 fstart = PMR_ALIGN(fstart, align); 1074 fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz; 1075 if (end != 0) 1076 fend = MIN(end, fend); 1077 if (boundary != 0) { 1078 fend = 1079 MIN(fend, PMR_ALIGN(fstart + 1, boundary)); 1080 } 1081 if (fstart >= fend) 1082 continue; 1083 if (fend - fstart > count - fcount) 1084 fend = fstart + (count - fcount); 1085 1086 fcount += fend - fstart; 1087 fnsegs++; 1088 found = uvm_pmr_extract_range(pmr, found, 1089 fstart, fend, result); 1090 1091 if (fcount == count) 1092 goto out; 1093 1094 /* 1095 * If there's still space left in found, try to 1096 * fully drain it prior to continuing. 1097 */ 1098 if (found != NULL) { 1099 fstart = fend; 1100 goto drain_found; 1101 } 1102 } 1103 1104 /* Try a smaller search now. */ 1105 if (++try < nitems(search)) 1106 goto rescan; 1107 1108 /* 1109 * Exhaust all memory types prior to going to the next memory 1110 * segment. 1111 * This means that zero-vs-dirty are eaten prior to moving 1112 * to a pmemrange with a higher use-count. 
1113 * 1114 * Code is basically a difficult way of writing: 1115 * memtype = memtype_init; 1116 * do { 1117 * ...; 1118 * memtype += 1; 1119 * memtype %= MEMTYPE_MAX; 1120 * } while (memtype != memtype_init); 1121 */ 1122 memtype += 1; 1123 if (memtype == UVM_PMR_MEMTYPE_MAX) 1124 memtype = 0; 1125 if (memtype != memtype_init) 1126 goto rescan_memtype; 1127 1128 /* 1129 * If not desperate, enter desperate case prior to eating all 1130 * the good stuff in the next range. 1131 */ 1132 if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL && 1133 TAILQ_NEXT(pmr, pmr_use)->use != pmr->use) 1134 break; 1135 } 1136 1137 /* 1138 * Not enough memory of the requested type available. Fall back to 1139 * less good memory that we'll clean up better later. 1140 * 1141 * This algorithm is not very smart though, it just starts scanning 1142 * a different typed range, but the nicer ranges of the previous 1143 * iteration may fall out. Hence there is a small chance of a false 1144 * negative. 1145 * 1146 * When desperate: scan all sizes starting at the smallest 1147 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may 1148 * allow us to hit the fast path now). 1149 * 1150 * Also, because we will revisit entries we scanned before, we need 1151 * to reset the page queue, or we may end up releasing entries in 1152 * such a way as to invalidate f_next. 1153 */ 1154 if (!desperate) { 1155 desperate = 1; 1156 start_try = nitems(search) - 1; 1157 flags &= ~UVM_PLA_TRYCONTIG; 1158 1159 while (!TAILQ_EMPTY(result)) 1160 uvm_pmr_remove_1strange(result, 0, NULL, 0); 1161 fnsegs = 0; 1162 fcount = 0; 1163 goto retry_desperate; 1164 } 1165 1166 fail: 1167 /* Allocation failed. */ 1168 /* XXX: claim from memory reserve here */ 1169 1170 while (!TAILQ_EMPTY(result)) 1171 uvm_pmr_remove_1strange(result, 0, NULL, 0); 1172 1173 if (flags & UVM_PLA_WAITOK) { 1174 if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count), 1175 flags & UVM_PLA_FAILOK) == 0) 1176 goto retry; 1177 KASSERT(flags & UVM_PLA_FAILOK); 1178 } else { 1179 if (!(flags & UVM_PLA_NOWAKE)) { 1180 uvm_nowait_failed = 1; 1181 wakeup(&uvm.pagedaemon); 1182 } 1183 } 1184 uvm_unlock_fpageq(); 1185 1186 return ENOMEM; 1187 1188 out: 1189 /* Allocation successful. */ 1190 uvmexp.free -= fcount; 1191 1192 uvm_unlock_fpageq(); 1193 1194 /* Update statistics and zero pages if UVM_PLA_ZERO. */ 1195 #ifdef DIAGNOSTIC 1196 fnsegs = 0; 1197 fcount = 0; 1198 diag_prev = NULL; 1199 #endif /* DIAGNOSTIC */ 1200 TAILQ_FOREACH(found, result, pageq) { 1201 atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK); 1202 1203 if (found->pg_flags & PG_ZERO) { 1204 uvm_lock_fpageq(); 1205 uvmexp.zeropages--; 1206 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1207 wakeup(&uvmexp.zeropages); 1208 uvm_unlock_fpageq(); 1209 } 1210 if (flags & UVM_PLA_ZERO) { 1211 if (found->pg_flags & PG_ZERO) 1212 uvmexp.pga_zerohit++; 1213 else { 1214 uvmexp.pga_zeromiss++; 1215 uvm_pagezero(found); 1216 } 1217 } 1218 atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE); 1219 1220 found->uobject = NULL; 1221 found->uanon = NULL; 1222 found->pg_version++; 1223 1224 /* 1225 * Validate that the page matches range criterium. 1226 */ 1227 KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start); 1228 KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end); 1229 1230 #ifdef DIAGNOSTIC 1231 /* 1232 * Update fcount (# found pages) and 1233 * fnsegs (# found segments) counters. 
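	 * (With boundary == 0, ~(boundary - 1) evaluates to 0, so the
	 * boundary test below never starts a new segment.)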
1234 */ 1235 if (diag_prev == NULL || 1236 /* new segment if it contains a hole */ 1237 atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 != 1238 atop(VM_PAGE_TO_PHYS(found)) || 1239 /* new segment if it crosses boundary */ 1240 (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) != 1241 (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1))) 1242 fnsegs++; 1243 fcount++; 1244 1245 diag_prev = found; 1246 #endif /* DIAGNOSTIC */ 1247 } 1248 1249 #ifdef DIAGNOSTIC 1250 /* 1251 * Panic on algorithm failure. 1252 */ 1253 if (fcount != count || fnsegs > maxseg) { 1254 panic("pmemrange allocation error: " 1255 "allocated %ld pages in %d segments, " 1256 "but request was %ld pages in %d segments", 1257 fcount, fnsegs, count, maxseg); 1258 } 1259 #endif /* DIAGNOSTIC */ 1260 1261 return 0; 1262 } 1263 1264 /* 1265 * Free a number of contig pages (invoked by uvm_page_init). 1266 */ 1267 void 1268 uvm_pmr_freepages(struct vm_page *pg, psize_t count) 1269 { 1270 struct uvm_pmemrange *pmr; 1271 psize_t i, pmr_count; 1272 struct vm_page *firstpg = pg; 1273 1274 for (i = 0; i < count; i++) { 1275 KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) == 1276 atop(VM_PAGE_TO_PHYS(pg)) + i); 1277 1278 if (!((pg[i].pg_flags & PQ_FREE) == 0 && 1279 VALID_FLAGS(pg[i].pg_flags))) { 1280 printf("Flags: 0x%x, will panic now.\n", 1281 pg[i].pg_flags); 1282 } 1283 KASSERT((pg[i].pg_flags & PQ_FREE) == 0 && 1284 VALID_FLAGS(pg[i].pg_flags)); 1285 atomic_setbits_int(&pg[i].pg_flags, PQ_FREE); 1286 atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO); 1287 } 1288 1289 uvm_lock_fpageq(); 1290 1291 for (i = count; i > 0; i -= pmr_count) { 1292 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1293 KASSERT(pmr != NULL); 1294 1295 pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg))); 1296 pg->fpgsz = pmr_count; 1297 uvm_pmr_insert(pmr, pg, 0); 1298 1299 uvmexp.free += pmr_count; 1300 pg += pmr_count; 1301 } 1302 wakeup(&uvmexp.free); 1303 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1304 wakeup(&uvmexp.zeropages); 1305 1306 uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count)); 1307 1308 uvm_unlock_fpageq(); 1309 } 1310 1311 /* 1312 * Free all pages in the queue. 1313 */ 1314 void 1315 uvm_pmr_freepageq(struct pglist *pgl) 1316 { 1317 struct vm_page *pg; 1318 paddr_t pstart; 1319 psize_t plen; 1320 1321 TAILQ_FOREACH(pg, pgl, pageq) { 1322 if (!((pg->pg_flags & PQ_FREE) == 0 && 1323 VALID_FLAGS(pg->pg_flags))) { 1324 printf("Flags: 0x%x, will panic now.\n", 1325 pg->pg_flags); 1326 } 1327 KASSERT((pg->pg_flags & PQ_FREE) == 0 && 1328 VALID_FLAGS(pg->pg_flags)); 1329 atomic_setbits_int(&pg->pg_flags, PQ_FREE); 1330 atomic_clearbits_int(&pg->pg_flags, PG_ZERO); 1331 } 1332 1333 uvm_lock_fpageq(); 1334 while (!TAILQ_EMPTY(pgl)) { 1335 pg = TAILQ_FIRST(pgl); 1336 if (pg == TAILQ_NEXT(pg, pageq) + 1) { 1337 /* 1338 * If pg is one behind the position of the 1339 * next page in the list in the page array, 1340 * try going backwards instead of forward. 1341 */ 1342 plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart); 1343 } else { 1344 pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl)); 1345 plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0); 1346 } 1347 uvmexp.free += plen; 1348 1349 uvm_wakeup_pla(pstart, ptoa(plen)); 1350 } 1351 wakeup(&uvmexp.free); 1352 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1353 wakeup(&uvmexp.zeropages); 1354 uvm_unlock_fpageq(); 1355 1356 return; 1357 } 1358 1359 /* 1360 * Store a pmemrange in the list. 1361 * 1362 * The list is sorted by use. 
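 *
 * Returns NULL if pmr was inserted; if a range comparing equal is already
 * present, that range is returned and pmr is not inserted.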
1363 */ 1364 struct uvm_pmemrange * 1365 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq, 1366 struct uvm_pmemrange *pmr) 1367 { 1368 struct uvm_pmemrange *iter; 1369 int cmp = 1; 1370 1371 TAILQ_FOREACH(iter, useq, pmr_use) { 1372 cmp = uvm_pmemrange_use_cmp(pmr, iter); 1373 if (cmp == 0) 1374 return iter; 1375 if (cmp == -1) 1376 break; 1377 } 1378 1379 if (iter == NULL) 1380 TAILQ_INSERT_TAIL(useq, pmr, pmr_use); 1381 else 1382 TAILQ_INSERT_BEFORE(iter, pmr, pmr_use); 1383 return NULL; 1384 } 1385 1386 #ifdef DEBUG 1387 /* 1388 * Validation of the whole pmemrange. 1389 * Called with fpageq locked. 1390 */ 1391 void 1392 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr) 1393 { 1394 struct vm_page *prev, *next, *i, *xref; 1395 int lcv, mti; 1396 1397 /* Empty range */ 1398 if (pmr->nsegs == 0) 1399 return; 1400 1401 /* Validate address tree. */ 1402 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) { 1403 /* Validate the range. */ 1404 KASSERT(i->fpgsz > 0); 1405 KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low); 1406 KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz 1407 <= pmr->high); 1408 1409 /* Validate each page in this range. */ 1410 for (lcv = 0; lcv < i->fpgsz; lcv++) { 1411 /* 1412 * Only the first page has a size specification. 1413 * Rest is size 0. 1414 */ 1415 KASSERT(lcv == 0 || i[lcv].fpgsz == 0); 1416 /* 1417 * Flag check. 1418 */ 1419 KASSERT(VALID_FLAGS(i[lcv].pg_flags) && 1420 (i[lcv].pg_flags & PQ_FREE) == PQ_FREE); 1421 /* 1422 * Free pages are: 1423 * - not wired 1424 * - have no vm_anon 1425 * - have no uvm_object 1426 */ 1427 KASSERT(i[lcv].wire_count == 0); 1428 KASSERT(i[lcv].uanon == (void*)0xdeadbeef || 1429 i[lcv].uanon == NULL); 1430 KASSERT(i[lcv].uobject == (void*)0xdeadbeef || 1431 i[lcv].uobject == NULL); 1432 /* 1433 * Pages in a single range always have the same 1434 * memtype. 1435 */ 1436 KASSERT(uvm_pmr_pg_to_memtype(&i[0]) == 1437 uvm_pmr_pg_to_memtype(&i[lcv])); 1438 } 1439 1440 /* Check that it shouldn't be joined with its predecessor. */ 1441 prev = RBT_PREV(uvm_pmr_addr, i); 1442 if (prev != NULL) { 1443 KASSERT(uvm_pmr_pg_to_memtype(i) != 1444 uvm_pmr_pg_to_memtype(prev) || 1445 atop(VM_PAGE_TO_PHYS(i)) > 1446 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz || 1447 prev + prev->fpgsz != i); 1448 } 1449 1450 /* Assert i is in the size tree as well. */ 1451 if (i->fpgsz == 1) { 1452 TAILQ_FOREACH(xref, 1453 &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) { 1454 if (xref == i) 1455 break; 1456 } 1457 KASSERT(xref == i); 1458 } else { 1459 KASSERT(RBT_FIND(uvm_pmr_size, 1460 &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) == 1461 i + 1); 1462 } 1463 } 1464 1465 /* Validate size tree. */ 1466 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 1467 for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) { 1468 next = uvm_pmr_nextsz(pmr, i, mti); 1469 if (next != NULL) { 1470 KASSERT(i->fpgsz <= 1471 next->fpgsz); 1472 } 1473 1474 /* Assert i is in the addr tree as well. */ 1475 KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i); 1476 1477 /* Assert i is of the correct memory type. */ 1478 KASSERT(uvm_pmr_pg_to_memtype(i) == mti); 1479 } 1480 } 1481 1482 /* Validate nsegs statistic. */ 1483 lcv = 0; 1484 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) 1485 lcv++; 1486 KASSERT(pmr->nsegs == lcv); 1487 } 1488 #endif /* DEBUG */ 1489 1490 /* 1491 * Split pmr at split point pageno. 1492 * Called with fpageq unlocked. 1493 * 1494 * Split is only applied if a pmemrange spans pageno. 
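 *
 * E.g. splitting [low, high) at pageno shrinks the existing pmemrange to
 * [low, pageno) and moves everything above into a new "drain" pmemrange
 * [pageno, high) with the same use count; a free chunk straddling pageno
 * is cut in two.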
1495 */ 1496 void 1497 uvm_pmr_split(paddr_t pageno) 1498 { 1499 struct uvm_pmemrange *pmr, *drain; 1500 struct vm_page *rebuild, *prev, *next; 1501 psize_t prev_sz; 1502 1503 uvm_lock_fpageq(); 1504 pmr = uvm_pmemrange_find(pageno); 1505 if (pmr == NULL || !(pmr->low < pageno)) { 1506 /* No split required. */ 1507 uvm_unlock_fpageq(); 1508 return; 1509 } 1510 1511 KASSERT(pmr->low < pageno); 1512 KASSERT(pmr->high > pageno); 1513 1514 /* 1515 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into 1516 * uvm_kmemalloc which calls into pmemrange, making the locking 1517 * a bit hard, so we just race! 1518 */ 1519 uvm_unlock_fpageq(); 1520 drain = uvm_pmr_allocpmr(); 1521 uvm_lock_fpageq(); 1522 pmr = uvm_pmemrange_find(pageno); 1523 if (pmr == NULL || !(pmr->low < pageno)) { 1524 /* 1525 * We lost the race since someone else ran this or a related 1526 * function, however this should be triggered very rarely so 1527 * we just leak the pmr. 1528 */ 1529 printf("uvm_pmr_split: lost one pmr\n"); 1530 uvm_unlock_fpageq(); 1531 return; 1532 } 1533 1534 drain->low = pageno; 1535 drain->high = pmr->high; 1536 drain->use = pmr->use; 1537 1538 uvm_pmr_assertvalid(pmr); 1539 uvm_pmr_assertvalid(drain); 1540 KASSERT(drain->nsegs == 0); 1541 1542 RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) { 1543 if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno) 1544 break; 1545 } 1546 if (rebuild == NULL) 1547 prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 1548 else 1549 prev = RBT_PREV(uvm_pmr_addr, rebuild); 1550 KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1551 1552 /* 1553 * Handle free chunk that spans the split point. 1554 */ 1555 if (prev != NULL && 1556 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) { 1557 psize_t before, after; 1558 1559 KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1560 1561 uvm_pmr_remove(pmr, prev); 1562 prev_sz = prev->fpgsz; 1563 before = pageno - atop(VM_PAGE_TO_PHYS(prev)); 1564 after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno; 1565 1566 KASSERT(before > 0); 1567 KASSERT(after > 0); 1568 1569 prev->fpgsz = before; 1570 uvm_pmr_insert(pmr, prev, 1); 1571 (prev + before)->fpgsz = after; 1572 uvm_pmr_insert(drain, prev + before, 1); 1573 } 1574 1575 /* Move free chunks that no longer fall in the range. */ 1576 for (; rebuild != NULL; rebuild = next) { 1577 next = RBT_NEXT(uvm_pmr_addr, rebuild); 1578 1579 uvm_pmr_remove(pmr, rebuild); 1580 uvm_pmr_insert(drain, rebuild, 1); 1581 } 1582 1583 pmr->high = pageno; 1584 uvm_pmr_assertvalid(pmr); 1585 uvm_pmr_assertvalid(drain); 1586 1587 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain); 1588 uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain); 1589 uvm_unlock_fpageq(); 1590 } 1591 1592 /* 1593 * Increase the usage counter for the given range of memory. 1594 * 1595 * The more usage counters a given range of memory has, the more will be 1596 * attempted not to allocate from it. 1597 * 1598 * Addresses here are in paddr_t, not page-numbers. 1599 * The lowest and highest allowed address are specified. 1600 */ 1601 void 1602 uvm_pmr_use_inc(paddr_t low, paddr_t high) 1603 { 1604 struct uvm_pmemrange *pmr; 1605 paddr_t sz; 1606 1607 /* pmr uses page numbers, translate low and high. */ 1608 high++; 1609 high = atop(trunc_page(high)); 1610 low = atop(round_page(low)); 1611 uvm_pmr_split(low); 1612 uvm_pmr_split(high); 1613 1614 sz = 0; 1615 uvm_lock_fpageq(); 1616 /* Increase use count on segments in range. 
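	 * sz accumulates the page count of the ranges whose use counter
	 * was bumped, for the sanity KASSERT below.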
*/ 1617 RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) { 1618 if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) { 1619 TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use); 1620 pmr->use++; 1621 sz += pmr->high - pmr->low; 1622 uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr); 1623 } 1624 uvm_pmr_assertvalid(pmr); 1625 } 1626 uvm_unlock_fpageq(); 1627 1628 KASSERT(sz >= high - low); 1629 } 1630 1631 /* 1632 * Allocate a pmemrange. 1633 * 1634 * If called from uvm_page_init, the uvm_pageboot_alloc is used. 1635 * If called after uvm_init, malloc is used. 1636 * (And if called in between, you're dead.) 1637 */ 1638 struct uvm_pmemrange * 1639 uvm_pmr_allocpmr(void) 1640 { 1641 struct uvm_pmemrange *nw; 1642 int i; 1643 1644 /* We're only ever hitting the !uvm.page_init_done case for now. */ 1645 if (!uvm.page_init_done) { 1646 nw = (struct uvm_pmemrange *) 1647 uvm_pageboot_alloc(sizeof(struct uvm_pmemrange)); 1648 } else { 1649 nw = malloc(sizeof(struct uvm_pmemrange), 1650 M_VMMAP, M_NOWAIT); 1651 } 1652 KASSERT(nw != NULL); 1653 memset(nw, 0, sizeof(struct uvm_pmemrange)); 1654 RBT_INIT(uvm_pmr_addr, &nw->addr); 1655 for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) { 1656 RBT_INIT(uvm_pmr_size, &nw->size[i]); 1657 TAILQ_INIT(&nw->single[i]); 1658 } 1659 return nw; 1660 } 1661 1662 /* 1663 * Initialization of pmr. 1664 * Called by uvm_page_init. 1665 * 1666 * Sets up pmemranges. 1667 */ 1668 void 1669 uvm_pmr_init(void) 1670 { 1671 struct uvm_pmemrange *new_pmr; 1672 int i; 1673 1674 TAILQ_INIT(&uvm.pmr_control.use); 1675 RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 1676 TAILQ_INIT(&uvm.pmr_control.allocs); 1677 1678 /* By default, one range for the entire address space. */ 1679 new_pmr = uvm_pmr_allocpmr(); 1680 new_pmr->low = 0; 1681 new_pmr->high = atop((paddr_t)-1) + 1; 1682 1683 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr); 1684 uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr); 1685 1686 for (i = 0; uvm_md_constraints[i] != NULL; i++) { 1687 uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low, 1688 uvm_md_constraints[i]->ucr_high); 1689 } 1690 } 1691 1692 /* 1693 * Find the pmemrange that contains the given page number. 1694 * 1695 * (Manually traverses the binary tree, because that is cheaper on stack 1696 * usage.) 1697 */ 1698 struct uvm_pmemrange * 1699 uvm_pmemrange_find(paddr_t pageno) 1700 { 1701 struct uvm_pmemrange *pmr; 1702 1703 pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 1704 while (pmr != NULL) { 1705 if (pmr->low > pageno) 1706 pmr = RBT_LEFT(uvm_pmemrange_addr, pmr); 1707 else if (pmr->high <= pageno) 1708 pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr); 1709 else 1710 break; 1711 } 1712 1713 return pmr; 1714 } 1715 1716 #if defined(DDB) || defined(DEBUG) 1717 /* 1718 * Return true if the given page is in any of the free lists. 1719 * Used by uvm_page_printit. 1720 * This function is safe, even if the page is not on the freeq. 1721 * Note: does not apply locking, only called from ddb. 1722 */ 1723 int 1724 uvm_pmr_isfree(struct vm_page *pg) 1725 { 1726 struct vm_page *r; 1727 struct uvm_pmemrange *pmr; 1728 1729 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1730 if (pmr == NULL) 1731 return 0; 1732 r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 1733 if (r == NULL) 1734 r = RBT_MAX(uvm_pmr_addr, &pmr->addr); 1735 else if (r != pg) 1736 r = RBT_PREV(uvm_pmr_addr, r); 1737 if (r == NULL) 1738 return 0; /* Empty tree. 
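			   * (or no free range starts at or below pg)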
*/ 1739 1740 KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg))); 1741 return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz > 1742 atop(VM_PAGE_TO_PHYS(pg)); 1743 } 1744 #endif /* DEBUG */ 1745 1746 /* 1747 * Given a root of a tree, find a range which intersects start, end and 1748 * is of the same memtype. 1749 * 1750 * Page must be in the address tree. 1751 */ 1752 struct vm_page* 1753 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root, 1754 paddr_t start, paddr_t end, int memtype) 1755 { 1756 int direction; 1757 struct vm_page *root; 1758 struct vm_page *high, *high_next; 1759 struct vm_page *low, *low_next; 1760 1761 KDASSERT(pmr != NULL && init_root != NULL); 1762 root = init_root; 1763 1764 /* Which direction to use for searching. */ 1765 if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start) 1766 direction = 1; 1767 else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end) 1768 direction = -1; 1769 else /* nothing to do */ 1770 return root; 1771 1772 /* First, update root to fall within the chosen range. */ 1773 while (root && !PMR_INTERSECTS_WITH( 1774 atop(VM_PAGE_TO_PHYS(root)), 1775 atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz, 1776 start, end)) { 1777 if (direction == 1) 1778 root = RBT_RIGHT(uvm_objtree, root); 1779 else 1780 root = RBT_LEFT(uvm_objtree, root); 1781 } 1782 if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype) 1783 return root; 1784 1785 /* 1786 * Root is valid, but of the wrong memtype. 1787 * 1788 * Try to find a range that has the given memtype in the subtree 1789 * (memtype mismatches are costly, either because the conversion 1790 * is expensive, or a later allocation will need to do the opposite 1791 * conversion, which will be expensive). 1792 * 1793 * 1794 * First, simply increase address until we hit something we can use. 1795 * Cache the upper page, so we can page-walk later. 1796 */ 1797 high = root; 1798 high_next = RBT_RIGHT(uvm_objtree, high); 1799 while (high_next != NULL && PMR_INTERSECTS_WITH( 1800 atop(VM_PAGE_TO_PHYS(high_next)), 1801 atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz, 1802 start, end)) { 1803 high = high_next; 1804 if (uvm_pmr_pg_to_memtype(high) == memtype) 1805 return high; 1806 high_next = RBT_RIGHT(uvm_objtree, high); 1807 } 1808 1809 /* 1810 * Second, decrease the address until we hit something we can use. 1811 * Cache the lower page, so we can page-walk later. 1812 */ 1813 low = root; 1814 low_next = RBT_LEFT(uvm_objtree, low); 1815 while (low_next != NULL && PMR_INTERSECTS_WITH( 1816 atop(VM_PAGE_TO_PHYS(low_next)), 1817 atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz, 1818 start, end)) { 1819 low = low_next; 1820 if (uvm_pmr_pg_to_memtype(low) == memtype) 1821 return low; 1822 low_next = RBT_LEFT(uvm_objtree, low); 1823 } 1824 1825 if (low == high) 1826 return NULL; 1827 1828 /* No hits. Walk the address tree until we find something usable. */ 1829 for (low = RBT_NEXT(uvm_pmr_addr, low); 1830 low != high; 1831 low = RBT_NEXT(uvm_pmr_addr, low)) { 1832 KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)), 1833 atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz, 1834 start, end)); 1835 if (uvm_pmr_pg_to_memtype(low) == memtype) 1836 return low; 1837 } 1838 1839 /* Nothing found. */ 1840 return NULL; 1841 } 1842 1843 /* 1844 * Allocate any page, the fastest way. Page number constraints only. 
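 *
 * count: number of pages requested
 * memtype_init: memtype to start searching at
 * result: list the pages are prepended to (uses pageq)
 * start, end: page number constraints (start = end = 0: no limitation)
 * memtype_only: if set, do not fall back to other memtypes
 *
 * Returns the number of pages placed on result.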
1845 */ 1846 psize_t 1847 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result, 1848 paddr_t start, paddr_t end, int memtype_only) 1849 { 1850 struct uvm_pmemrange *pmr; 1851 struct vm_page *found, *splitpg; 1852 psize_t fcount; 1853 int memtype; 1854 1855 fcount = 0; 1856 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1857 /* We're done. */ 1858 if (fcount == count) 1859 break; 1860 1861 /* Outside requested range. */ 1862 if (!(start == 0 && end == 0) && 1863 !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 1864 continue; 1865 1866 /* Range is empty. */ 1867 if (pmr->nsegs == 0) 1868 continue; 1869 1870 /* Loop over all memtypes, starting at memtype_init. */ 1871 memtype = memtype_init; 1872 while (fcount != count) { 1873 found = TAILQ_FIRST(&pmr->single[memtype]); 1874 /* 1875 * If found is outside the range, walk the list 1876 * until we find something that intersects with 1877 * boundaries. 1878 */ 1879 while (found && !PMR_INTERSECTS_WITH( 1880 atop(VM_PAGE_TO_PHYS(found)), 1881 atop(VM_PAGE_TO_PHYS(found)) + 1, 1882 start, end)) 1883 found = TAILQ_NEXT(found, pageq); 1884 1885 if (found == NULL) { 1886 /* 1887 * Check if the size tree contains a range 1888 * that intersects with the boundaries. As the 1889 * allocation is for any page, try the smallest 1890 * range so that large ranges are preserved for 1891 * more constrained cases. Only one entry is 1892 * checked here, to avoid a brute-force search. 1893 * 1894 * Note that a size tree gives pg[1] instead of 1895 * pg[0]. 1896 */ 1897 found = RBT_MIN(uvm_pmr_size, 1898 &pmr->size[memtype]); 1899 if (found != NULL) { 1900 found--; 1901 if (!PMR_INTERSECTS_WITH( 1902 atop(VM_PAGE_TO_PHYS(found)), 1903 atop(VM_PAGE_TO_PHYS(found)) + 1904 found->fpgsz, start, end)) 1905 found = NULL; 1906 } 1907 } 1908 if (found == NULL) { 1909 /* 1910 * Try address-guided search to meet the page 1911 * number constraints. 1912 */ 1913 found = RBT_ROOT(uvm_pmr_addr, &pmr->addr); 1914 if (found != NULL) { 1915 found = uvm_pmr_rootupdate(pmr, found, 1916 start, end, memtype); 1917 } 1918 } 1919 if (found != NULL) { 1920 uvm_pmr_assertvalid(pmr); 1921 uvm_pmr_remove_size(pmr, found); 1922 1923 /* 1924 * If the page intersects the end, then it'll 1925 * need splitting. 1926 * 1927 * Note that we don't need to split if the page 1928 * intersects start: the drain function will 1929 * simply stop on hitting start. 1930 */ 1931 if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) + 1932 found->fpgsz > end) { 1933 psize_t splitsz = 1934 atop(VM_PAGE_TO_PHYS(found)) + 1935 found->fpgsz - end; 1936 1937 uvm_pmr_remove_addr(pmr, found); 1938 uvm_pmr_assertvalid(pmr); 1939 found->fpgsz -= splitsz; 1940 splitpg = found + found->fpgsz; 1941 splitpg->fpgsz = splitsz; 1942 uvm_pmr_insert(pmr, splitpg, 1); 1943 1944 /* 1945 * At this point, splitpg and found 1946 * actually should be joined. 1947 * But we explicitly disable that, 1948 * because we will start subtracting 1949 * from found. 1950 */ 1951 KASSERT(start == 0 || 1952 atop(VM_PAGE_TO_PHYS(found)) + 1953 found->fpgsz > start); 1954 uvm_pmr_insert_addr(pmr, found, 1); 1955 } 1956 1957 /* 1958 * Fetch pages from the end. 1959 * If the range is larger than the requested 1960 * number of pages, this saves us an addr-tree 1961 * update. 1962 * 1963 * Since we take from the end and insert at 1964 * the head, any ranges keep preserved. 
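				 *
				 * E.g. taking 2 pages from a free range
				 * covering page numbers 10-14 hands out
				 * pages 14 and 13 and leaves 10-12 behind
				 * as a smaller free range.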
1965 */ 1966 while (found->fpgsz > 0 && fcount < count && 1967 (start == 0 || 1968 atop(VM_PAGE_TO_PHYS(found)) + 1969 found->fpgsz > start)) { 1970 found->fpgsz--; 1971 fcount++; 1972 TAILQ_INSERT_HEAD(result, 1973 &found[found->fpgsz], pageq); 1974 } 1975 if (found->fpgsz > 0) { 1976 uvm_pmr_insert_size(pmr, found); 1977 KDASSERT(fcount == count); 1978 uvm_pmr_assertvalid(pmr); 1979 return fcount; 1980 } 1981 1982 /* 1983 * Delayed addr-tree removal. 1984 */ 1985 uvm_pmr_remove_addr(pmr, found); 1986 uvm_pmr_assertvalid(pmr); 1987 } else { 1988 if (memtype_only) 1989 break; 1990 /* 1991 * Skip to the next memtype. 1992 */ 1993 memtype += 1; 1994 if (memtype == UVM_PMR_MEMTYPE_MAX) 1995 memtype = 0; 1996 if (memtype == memtype_init) 1997 break; 1998 } 1999 } 2000 } 2001 2002 /* 2003 * Search finished. 2004 * 2005 * Ran out of ranges before enough pages were gathered, or we hit the 2006 * case where found->fpgsz == count - fcount, in which case the 2007 * above exit condition didn't trigger. 2008 * 2009 * On failure, caller will free the pages. 2010 */ 2011 return fcount; 2012 } 2013 2014 #ifdef DDB 2015 /* 2016 * Print information about pmemrange. 2017 * Does not do locking (so either call it from DDB or acquire fpageq lock 2018 * before invoking. 2019 */ 2020 void 2021 uvm_pmr_print(void) 2022 { 2023 struct uvm_pmemrange *pmr; 2024 struct vm_page *pg; 2025 psize_t size[UVM_PMR_MEMTYPE_MAX]; 2026 psize_t free; 2027 int useq_len; 2028 int mt; 2029 2030 printf("Ranges, use queue:\n"); 2031 useq_len = 0; 2032 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 2033 useq_len++; 2034 free = 0; 2035 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 2036 pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]); 2037 if (pg != NULL) 2038 pg--; 2039 else 2040 pg = TAILQ_FIRST(&pmr->single[mt]); 2041 size[mt] = (pg == NULL ? 0 : pg->fpgsz); 2042 2043 RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr) 2044 free += pg->fpgsz; 2045 } 2046 2047 printf("* [0x%lx-0x%lx] use=%d nsegs=%ld", 2048 (unsigned long)pmr->low, (unsigned long)pmr->high, 2049 pmr->use, (unsigned long)pmr->nsegs); 2050 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 2051 printf(" maxsegsz[%d]=0x%lx", mt, 2052 (unsigned long)size[mt]); 2053 } 2054 printf(" free=0x%lx\n", (unsigned long)free); 2055 } 2056 printf("#ranges = %d\n", useq_len); 2057 } 2058 #endif 2059 2060 /* 2061 * uvm_wait_pla: wait (sleep) for the page daemon to free some pages 2062 * in a specific physmem area. 2063 * 2064 * Returns ENOMEM if the pagedaemon failed to free any pages. 2065 * If not failok, failure will lead to panic. 2066 * 2067 * Must be called with fpageq locked. 2068 */ 2069 int 2070 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok) 2071 { 2072 struct uvm_pmalloc pma; 2073 const char *wmsg = "pmrwait"; 2074 2075 if (curproc == uvm.pagedaemon_proc) { 2076 /* 2077 * This is not that uncommon when the pagedaemon is trying 2078 * to flush out a large mmapped file. VOP_WRITE will circle 2079 * back through the buffer cache and try to get more memory. 2080 * The pagedaemon starts by calling bufbackoff, but we can 2081 * easily use up that reserve in a single scan iteration. 2082 */ 2083 uvm_unlock_fpageq(); 2084 if (bufbackoff(NULL, atop(size)) == 0) { 2085 uvm_lock_fpageq(); 2086 return 0; 2087 } 2088 uvm_lock_fpageq(); 2089 2090 /* 2091 * XXX detect pagedaemon deadlock - see comment in 2092 * uvm_wait(), as this is exactly the same issue. 
2093 */ 2094 printf("pagedaemon: wait_pla deadlock detected!\n"); 2095 msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, 2096 MSEC_TO_NSEC(125)); 2097 #if defined(DEBUG) 2098 /* DEBUG: panic so we can debug it */ 2099 panic("wait_pla pagedaemon deadlock"); 2100 #endif 2101 return 0; 2102 } 2103 2104 for (;;) { 2105 pma.pm_constraint.ucr_low = low; 2106 pma.pm_constraint.ucr_high = high; 2107 pma.pm_size = size; 2108 pma.pm_flags = UVM_PMA_LINKED; 2109 TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq); 2110 2111 wakeup(&uvm.pagedaemon); /* wake the daemon! */ 2112 while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY)) 2113 msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP); 2114 2115 if (!(pma.pm_flags & UVM_PMA_FREED) && 2116 pma.pm_flags & UVM_PMA_FAIL) { 2117 if (failok) 2118 return ENOMEM; 2119 printf("uvm_wait: failed to free %ld pages between " 2120 "0x%lx-0x%lx\n", atop(size), low, high); 2121 } else 2122 return 0; 2123 } 2124 /* UNREACHABLE */ 2125 } 2126 2127 /* 2128 * Wake up uvm_pmalloc sleepers. 2129 */ 2130 void 2131 uvm_wakeup_pla(paddr_t low, psize_t len) 2132 { 2133 struct uvm_pmalloc *pma, *pma_next; 2134 paddr_t high; 2135 2136 high = low + len; 2137 2138 /* Wake specific allocations waiting for this memory. */ 2139 for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL; 2140 pma = pma_next) { 2141 pma_next = TAILQ_NEXT(pma, pmq); 2142 2143 if (low < pma->pm_constraint.ucr_high && 2144 high > pma->pm_constraint.ucr_low) { 2145 pma->pm_flags |= UVM_PMA_FREED; 2146 if (!(pma->pm_flags & UVM_PMA_BUSY)) { 2147 pma->pm_flags &= ~UVM_PMA_LINKED; 2148 TAILQ_REMOVE(&uvm.pmr_control.allocs, pma, 2149 pmq); 2150 wakeup(pma); 2151 } 2152 } 2153 } 2154 } 2155 2156 void 2157 uvm_pagezero_thread(void *arg) 2158 { 2159 struct pglist pgl; 2160 struct vm_page *pg; 2161 int count; 2162 2163 /* Run at the lowest possible priority. */ 2164 curproc->p_p->ps_nice = NZERO + PRIO_MAX; 2165 2166 KERNEL_UNLOCK(); 2167 2168 TAILQ_INIT(&pgl); 2169 for (;;) { 2170 uvm_lock_fpageq(); 2171 while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET || 2172 (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY, 2173 &pgl, 0, 0, 1)) == 0) { 2174 msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock, 2175 MAXPRI, "pgzero", INFSLP); 2176 } 2177 uvm_unlock_fpageq(); 2178 2179 TAILQ_FOREACH(pg, &pgl, pageq) { 2180 uvm_pagezero(pg); 2181 atomic_setbits_int(&pg->pg_flags, PG_ZERO); 2182 } 2183 2184 uvm_lock_fpageq(); 2185 while (!TAILQ_EMPTY(&pgl)) 2186 uvm_pmr_remove_1strange(&pgl, 0, NULL, 0); 2187 uvmexp.zeropages += count; 2188 uvm_unlock_fpageq(); 2189 2190 yield(); 2191 } 2192 } 2193