1 /* $OpenBSD: uvm_pmemrange.c,v 1.59 2020/02/18 12:13:40 mpi Exp $ */ 2 3 /* 4 * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/param.h> 20 #include <sys/systm.h> 21 #include <uvm/uvm.h> 22 #include <sys/malloc.h> 23 #include <sys/kernel.h> 24 #include <sys/proc.h> 25 #include <sys/mount.h> 26 27 /* 28 * 2 trees: addr tree and size tree. 29 * 30 * The allocator keeps chunks of free pages (called a range). 31 * Two pages are part of the same range if: 32 * - all pages in between are part of that range, 33 * - they are of the same memory type (zeroed or non-zeroed), 34 * - they are part of the same pmemrange. 35 * A pmemrange is a range of memory which is part of the same vm_physseg 36 * and has a use-count. 37 * 38 * addr tree is vm_page[0].objt 39 * size tree is vm_page[1].objt 40 * 41 * The size tree is not used for memory ranges of 1 page, instead, 42 * single queue is vm_page[0].pageq 43 * 44 * vm_page[0].fpgsz describes the length of a free range. Two adjecent ranges 45 * are joined, unless: 46 * - they have pages in between them which are not free 47 * - they belong to different memtypes (zeroed vs dirty memory) 48 * - they are in different pmemrange areas (ISA vs non-ISA memory for instance) 49 * - they are not a continuation of the same array 50 * The latter issue is caused by vm_physseg ordering and splitting from the 51 * MD initialization machinery. The MD code is dependant on freelists and 52 * happens to split ISA memory from non-ISA memory. 53 * (Note: freelists die die die!) 54 * 55 * uvm_page_init guarantees that every vm_physseg contains an array of 56 * struct vm_page. Also, uvm_page_physload allocates an array of struct 57 * vm_page. This code depends on that array. The array may break across 58 * vm_physsegs boundaries. 59 */ 60 61 /* 62 * Validate the flags of the page. (Used in asserts.) 63 * Any free page must have the PQ_FREE flag set. 64 * Free pages may be zeroed. 65 * Pmap flags are left untouched. 66 * 67 * The PQ_FREE flag is not checked here: by not checking, we can easily use 68 * this check in pages which are freed. 69 */ 70 #define VALID_FLAGS(pg_flags) \ 71 (((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0) 72 73 /* Tree comparators. */ 74 int uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *, 75 const struct uvm_pmemrange *); 76 int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *); 77 int uvm_pmr_pg_to_memtype(struct vm_page *); 78 79 #ifdef DDB 80 void uvm_pmr_print(void); 81 #endif 82 83 /* 84 * Memory types. The page flags are used to derive what the current memory 85 * type of a page is. 86 */ 87 int 88 uvm_pmr_pg_to_memtype(struct vm_page *pg) 89 { 90 if (pg->pg_flags & PG_ZERO) 91 return UVM_PMR_MEMTYPE_ZERO; 92 /* Default: dirty memory. */ 93 return UVM_PMR_MEMTYPE_DIRTY; 94 } 95 96 /* Trees. */ 97 RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp); 98 RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp); 99 RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr, 100 uvm_pmemrange_addr_cmp); 101 102 /* Validation. */ 103 #ifdef DEBUG 104 void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr); 105 #else 106 #define uvm_pmr_assertvalid(pmr) do {} while (0) 107 #endif 108 109 psize_t uvm_pmr_get1page(psize_t, int, struct pglist *, 110 paddr_t, paddr_t, int); 111 112 struct uvm_pmemrange *uvm_pmr_allocpmr(void); 113 struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int); 114 struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *, 115 struct vm_page *, int); 116 void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, 117 struct vm_page *pg, struct vm_page **pg_prev, 118 struct vm_page **pg_next); 119 struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *, 120 struct vm_page *, paddr_t); 121 struct vm_page *uvm_pmr_findprevsegment(struct uvm_pmemrange *, 122 struct vm_page *, paddr_t); 123 psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t, 124 struct vm_page **, int); 125 psize_t uvm_pmr_remove_1strange_reverse(struct pglist *, 126 paddr_t *); 127 void uvm_pmr_split(paddr_t); 128 struct uvm_pmemrange *uvm_pmemrange_find(paddr_t); 129 struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *, 130 struct uvm_pmemrange *); 131 psize_t pow2divide(psize_t, psize_t); 132 struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *, 133 struct vm_page *, paddr_t, paddr_t, int); 134 135 /* 136 * Computes num/denom and rounds it up to the next power-of-2. 137 * 138 * This is a division function which calculates an approximation of 139 * num/denom, with result =~ num/denom. It is meant to be fast and doesn't 140 * have to be accurate. 141 * 142 * Providing too large a value makes the allocator slightly faster, at the 143 * risk of hitting the failure case more often. Providing too small a value 144 * makes the allocator a bit slower, but less likely to hit a failure case. 145 */ 146 psize_t 147 pow2divide(psize_t num, psize_t denom) 148 { 149 int rshift; 150 151 for (rshift = 0; num > denom; rshift++, denom <<= 1) 152 ; 153 return (paddr_t)1 << rshift; 154 } 155 156 /* 157 * Predicate: lhs is a subrange or rhs. 158 * 159 * If rhs_low == 0: don't care about lower bound. 160 * If rhs_high == 0: don't care about upper bound. 161 */ 162 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \ 163 (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \ 164 ((rhs_high) == 0 || (lhs_high) <= (rhs_high))) 165 166 /* 167 * Predicate: lhs intersects with rhs. 168 * 169 * If rhs_low == 0: don't care about lower bound. 170 * If rhs_high == 0: don't care about upper bound. 171 * Ranges don't intersect if they don't have any page in common, array 172 * semantics mean that < instead of <= should be used here. 173 */ 174 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \ 175 (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \ 176 ((rhs_high) == 0 || (lhs_low) < (rhs_high))) 177 178 /* 179 * Align to power-of-2 alignment. 180 */ 181 #define PMR_ALIGN(pgno, align) \ 182 (((pgno) + ((align) - 1)) & ~((align) - 1)) 183 #define PMR_ALIGN_DOWN(pgno, align) \ 184 ((pgno) & ~((align) - 1)) 185 186 187 /* 188 * Comparator: sort by address ascending. 189 */ 190 int 191 uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs, 192 const struct uvm_pmemrange *rhs) 193 { 194 return lhs->low < rhs->low ? -1 : lhs->low > rhs->low; 195 } 196 197 /* 198 * Comparator: sort by use ascending. 199 * 200 * The higher the use value of a range, the more devices need memory in 201 * this range. Therefore allocate from the range with the lowest use first. 202 */ 203 int 204 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs) 205 { 206 int result; 207 208 result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use; 209 if (result == 0) 210 result = uvm_pmemrange_addr_cmp(lhs, rhs); 211 return result; 212 } 213 214 int 215 uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 216 { 217 paddr_t lhs_addr, rhs_addr; 218 219 lhs_addr = VM_PAGE_TO_PHYS(lhs); 220 rhs_addr = VM_PAGE_TO_PHYS(rhs); 221 222 return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr); 223 } 224 225 int 226 uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 227 { 228 psize_t lhs_size, rhs_size; 229 int cmp; 230 231 /* Using second tree, so we receive pg[1] instead of pg[0]. */ 232 lhs_size = (lhs - 1)->fpgsz; 233 rhs_size = (rhs - 1)->fpgsz; 234 235 cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size); 236 if (cmp == 0) 237 cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1); 238 return cmp; 239 } 240 241 /* 242 * Find the first range of free pages that is at least sz pages long. 243 */ 244 struct vm_page * 245 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti) 246 { 247 struct vm_page *node, *best; 248 249 KASSERT(sz >= 1); 250 251 if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti])) 252 return TAILQ_FIRST(&pmr->single[mti]); 253 254 node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]); 255 best = NULL; 256 while (node != NULL) { 257 if ((node - 1)->fpgsz >= sz) { 258 best = (node - 1); 259 node = RBT_LEFT(uvm_objtree, node); 260 } else 261 node = RBT_RIGHT(uvm_objtree, node); 262 } 263 return best; 264 } 265 266 /* 267 * Finds the next range. The next range has a size >= pg->fpgsz. 268 * Returns NULL if no more ranges are available. 269 */ 270 struct vm_page * 271 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt) 272 { 273 struct vm_page *npg; 274 275 KASSERT(pmr != NULL && pg != NULL); 276 if (pg->fpgsz == 1) { 277 if (TAILQ_NEXT(pg, pageq) != NULL) 278 return TAILQ_NEXT(pg, pageq); 279 else 280 npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]); 281 } else 282 npg = RBT_NEXT(uvm_pmr_size, pg + 1); 283 284 return npg == NULL ? NULL : npg - 1; 285 } 286 287 /* 288 * Finds the previous and next ranges relative to the (uninserted) pg range. 289 * 290 * *pg_prev == NULL if no previous range is available, that can join with 291 * pg. 292 * *pg_next == NULL if no next range is available, that can join with 293 * pg. 294 */ 295 void 296 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg, 297 struct vm_page **pg_prev, struct vm_page **pg_next) 298 { 299 KASSERT(pg_prev != NULL && pg_next != NULL); 300 301 *pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 302 if (*pg_next == NULL) 303 *pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 304 else 305 *pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next); 306 307 KDASSERT(*pg_next == NULL || 308 VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg)); 309 KDASSERT(*pg_prev == NULL || 310 VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg)); 311 312 /* Reset if not contig. */ 313 if (*pg_prev != NULL && 314 (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz 315 != atop(VM_PAGE_TO_PHYS(pg)) || 316 *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */ 317 uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg))) 318 *pg_prev = NULL; 319 if (*pg_next != NULL && 320 (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz 321 != atop(VM_PAGE_TO_PHYS(*pg_next)) || 322 pg + pg->fpgsz != *pg_next || /* Array broke. */ 323 uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg))) 324 *pg_next = NULL; 325 return; 326 } 327 328 /* 329 * Remove a range from the address tree. 330 * Address tree maintains pmr counters. 331 */ 332 void 333 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg) 334 { 335 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 336 KDASSERT(pg->pg_flags & PQ_FREE); 337 RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg); 338 339 pmr->nsegs--; 340 } 341 /* 342 * Remove a range from the size tree. 343 */ 344 void 345 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 346 { 347 int memtype; 348 #ifdef DEBUG 349 struct vm_page *i; 350 #endif 351 352 KDASSERT(pg->fpgsz >= 1); 353 KDASSERT(pg->pg_flags & PQ_FREE); 354 memtype = uvm_pmr_pg_to_memtype(pg); 355 356 if (pg->fpgsz == 1) { 357 #ifdef DEBUG 358 TAILQ_FOREACH(i, &pmr->single[memtype], pageq) { 359 if (i == pg) 360 break; 361 } 362 KDASSERT(i == pg); 363 #endif 364 TAILQ_REMOVE(&pmr->single[memtype], pg, pageq); 365 } else { 366 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype], 367 pg + 1) == pg + 1); 368 RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1); 369 } 370 } 371 /* Remove from both trees. */ 372 void 373 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg) 374 { 375 uvm_pmr_assertvalid(pmr); 376 uvm_pmr_remove_size(pmr, pg); 377 uvm_pmr_remove_addr(pmr, pg); 378 uvm_pmr_assertvalid(pmr); 379 } 380 381 /* 382 * Insert the range described in pg. 383 * Returns the range thus created (which may be joined with the previous and 384 * next ranges). 385 * If no_join, the caller guarantees that the range cannot possibly join 386 * with adjecent ranges. 387 */ 388 struct vm_page * 389 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 390 { 391 struct vm_page *prev, *next; 392 393 #ifdef DEBUG 394 struct vm_page *i; 395 int mt; 396 #endif 397 398 KDASSERT(pg->pg_flags & PQ_FREE); 399 KDASSERT(pg->fpgsz >= 1); 400 401 #ifdef DEBUG 402 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 403 TAILQ_FOREACH(i, &pmr->single[mt], pageq) 404 KDASSERT(i != pg); 405 if (pg->fpgsz > 1) { 406 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt], 407 pg + 1) == NULL); 408 } 409 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL); 410 } 411 #endif 412 413 if (!no_join) { 414 uvm_pmr_pnaddr(pmr, pg, &prev, &next); 415 if (next != NULL) { 416 uvm_pmr_remove_size(pmr, next); 417 uvm_pmr_remove_addr(pmr, next); 418 pg->fpgsz += next->fpgsz; 419 next->fpgsz = 0; 420 } 421 if (prev != NULL) { 422 uvm_pmr_remove_size(pmr, prev); 423 prev->fpgsz += pg->fpgsz; 424 pg->fpgsz = 0; 425 return prev; 426 } 427 } 428 429 RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg); 430 431 pmr->nsegs++; 432 433 return pg; 434 } 435 /* 436 * Insert the range described in pg. 437 * Returns the range thus created (which may be joined with the previous and 438 * next ranges). 439 * Page must already be in the address tree. 440 */ 441 void 442 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 443 { 444 int memtype; 445 #ifdef DEBUG 446 struct vm_page *i; 447 int mti; 448 #endif 449 450 KDASSERT(pg->fpgsz >= 1); 451 KDASSERT(pg->pg_flags & PQ_FREE); 452 453 memtype = uvm_pmr_pg_to_memtype(pg); 454 #ifdef DEBUG 455 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 456 TAILQ_FOREACH(i, &pmr->single[mti], pageq) 457 KDASSERT(i != pg); 458 if (pg->fpgsz > 1) { 459 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti], 460 pg + 1) == NULL); 461 } 462 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 463 } 464 for (i = pg; i < pg + pg->fpgsz; i++) 465 KASSERT(uvm_pmr_pg_to_memtype(i) == memtype); 466 #endif 467 468 if (pg->fpgsz == 1) 469 TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq); 470 else 471 RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1); 472 } 473 /* Insert in both trees. */ 474 struct vm_page * 475 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 476 { 477 uvm_pmr_assertvalid(pmr); 478 pg = uvm_pmr_insert_addr(pmr, pg, no_join); 479 uvm_pmr_insert_size(pmr, pg); 480 uvm_pmr_assertvalid(pmr); 481 return pg; 482 } 483 484 /* 485 * Find the last page that is part of this segment. 486 * => pg: the range at which to start the search. 487 * => boundary: the page number boundary specification (0 = no boundary). 488 * => pmr: the pmemrange of the page. 489 * 490 * This function returns 1 before the next range, so if you want to have the 491 * next range, you need to run TAILQ_NEXT(result, pageq) after calling. 492 * The reason is that this way, the length of the segment is easily 493 * calculated using: atop(result) - atop(pg) + 1. 494 * Hence this function also never returns NULL. 495 */ 496 struct vm_page * 497 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr, 498 struct vm_page *pg, paddr_t boundary) 499 { 500 paddr_t first_boundary; 501 struct vm_page *next; 502 struct vm_page *prev; 503 504 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && 505 pmr->high > atop(VM_PAGE_TO_PHYS(pg))); 506 if (boundary != 0) { 507 first_boundary = 508 PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary); 509 } else 510 first_boundary = 0; 511 512 /* 513 * Increase next until it hits the first page of the next segment. 514 * 515 * While loop checks the following: 516 * - next != NULL we have not reached the end of pgl 517 * - boundary == 0 || next < first_boundary 518 * we do not cross a boundary 519 * - atop(prev) + 1 == atop(next) 520 * still in the same segment 521 * - low <= last 522 * - high > last still in the same memory range 523 * - memtype is equal allocator is unable to view different memtypes 524 * as part of the same segment 525 * - prev + 1 == next no array breakage occurs 526 */ 527 prev = pg; 528 next = TAILQ_NEXT(prev, pageq); 529 while (next != NULL && 530 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) && 531 atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) && 532 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && 533 pmr->high > atop(VM_PAGE_TO_PHYS(next)) && 534 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && 535 prev + 1 == next) { 536 prev = next; 537 next = TAILQ_NEXT(prev, pageq); 538 } 539 540 /* 541 * End of this segment. 542 */ 543 return prev; 544 } 545 546 /* 547 * Find the first page that is part of this segment. 548 * => pg: the range at which to start the search. 549 * => boundary: the page number boundary specification (0 = no boundary). 550 * => pmr: the pmemrange of the page. 551 * 552 * This function returns 1 after the previous range, so if you want to have the 553 * previous range, you need to run TAILQ_NEXT(result, pageq) after calling. 554 * The reason is that this way, the length of the segment is easily 555 * calculated using: atop(pg) - atop(result) + 1. 556 * Hence this function also never returns NULL. 557 */ 558 struct vm_page * 559 uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr, 560 struct vm_page *pg, paddr_t boundary) 561 { 562 paddr_t first_boundary; 563 struct vm_page *next; 564 struct vm_page *prev; 565 566 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && 567 pmr->high > atop(VM_PAGE_TO_PHYS(pg))); 568 if (boundary != 0) { 569 first_boundary = 570 PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary); 571 } else 572 first_boundary = 0; 573 574 /* 575 * Increase next until it hits the first page of the previous segment. 576 * 577 * While loop checks the following: 578 * - next != NULL we have not reached the end of pgl 579 * - boundary == 0 || next >= first_boundary 580 * we do not cross a boundary 581 * - atop(prev) - 1 == atop(next) 582 * still in the same segment 583 * - low <= last 584 * - high > last still in the same memory range 585 * - memtype is equal allocator is unable to view different memtypes 586 * as part of the same segment 587 * - prev - 1 == next no array breakage occurs 588 */ 589 prev = pg; 590 next = TAILQ_NEXT(prev, pageq); 591 while (next != NULL && 592 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) && 593 atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) && 594 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && 595 pmr->high > atop(VM_PAGE_TO_PHYS(next)) && 596 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && 597 prev - 1 == next) { 598 prev = next; 599 next = TAILQ_NEXT(prev, pageq); 600 } 601 602 /* 603 * Start of this segment. 604 */ 605 return prev; 606 } 607 608 /* 609 * Remove the first segment of contiguous pages from pgl. 610 * A segment ends if it crosses boundary (unless boundary = 0) or 611 * if it would enter a different uvm_pmemrange. 612 * 613 * Work: the page range that the caller is currently working with. 614 * May be null. 615 * 616 * If is_desperate is non-zero, the smallest segment is erased. Otherwise, 617 * the first segment is erased (which, if called by uvm_pmr_getpages(), 618 * probably is the smallest or very close to it). 619 */ 620 psize_t 621 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary, 622 struct vm_page **work, int is_desperate) 623 { 624 struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest; 625 psize_t count; 626 struct uvm_pmemrange *pmr, *pmr_iter; 627 628 KASSERT(!TAILQ_EMPTY(pgl)); 629 630 /* 631 * Initialize to first page. 632 * Unless desperate scan finds a better candidate, this is what'll be 633 * erased. 634 */ 635 start = TAILQ_FIRST(pgl); 636 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); 637 end = uvm_pmr_findnextsegment(pmr, start, boundary); 638 639 /* 640 * If we are desperate, we _really_ want to get rid of the smallest 641 * element (rather than a close match to the smallest element). 642 */ 643 if (is_desperate) { 644 /* Linear search for smallest segment. */ 645 pmr_iter = pmr; 646 for (iter = TAILQ_NEXT(end, pageq); 647 iter != NULL && start != end; 648 iter = TAILQ_NEXT(iter_end, pageq)) { 649 /* 650 * Only update pmr if it doesn't match current 651 * iteration. 652 */ 653 if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) || 654 pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) { 655 pmr_iter = uvm_pmemrange_find(atop( 656 VM_PAGE_TO_PHYS(iter))); 657 } 658 659 iter_end = uvm_pmr_findnextsegment(pmr_iter, iter, 660 boundary); 661 662 /* 663 * Current iteration is smaller than best match so 664 * far; update. 665 */ 666 if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) < 667 VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) { 668 start = iter; 669 end = iter_end; 670 pmr = pmr_iter; 671 } 672 } 673 } 674 675 /* 676 * Calculate count and end of the list. 677 */ 678 count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1; 679 lowest = start; 680 end = TAILQ_NEXT(end, pageq); 681 682 /* 683 * Actually remove the range of pages. 684 * 685 * Sadly, this cannot be done using pointer iteration: 686 * vm_physseg is not guaranteed to be sorted on address, hence 687 * uvm_page_init() may not have initialized its array sorted by 688 * page number. 689 */ 690 for (iter = start; iter != end; iter = iter_end) { 691 iter_end = TAILQ_NEXT(iter, pageq); 692 TAILQ_REMOVE(pgl, iter, pageq); 693 } 694 695 lowest->fpgsz = count; 696 inserted = uvm_pmr_insert(pmr, lowest, 0); 697 698 /* 699 * If the caller was working on a range and this function modified 700 * that range, update the pointer. 701 */ 702 if (work != NULL && *work != NULL && 703 atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) && 704 atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz > 705 atop(VM_PAGE_TO_PHYS(*work))) 706 *work = inserted; 707 return count; 708 } 709 710 /* 711 * Remove the first segment of contiguous pages from a pgl 712 * with the list elements in reverse order of physaddr. 713 * 714 * A segment ends if it would enter a different uvm_pmemrange. 715 * 716 * Stores starting physical address of the segment in pstart. 717 */ 718 psize_t 719 uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart) 720 { 721 struct vm_page *start, *end, *iter, *iter_end, *lowest; 722 psize_t count; 723 struct uvm_pmemrange *pmr; 724 725 KASSERT(!TAILQ_EMPTY(pgl)); 726 727 start = TAILQ_FIRST(pgl); 728 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); 729 end = uvm_pmr_findprevsegment(pmr, start, 0); 730 731 KASSERT(end <= start); 732 733 /* 734 * Calculate count and end of the list. 735 */ 736 count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1; 737 lowest = end; 738 end = TAILQ_NEXT(end, pageq); 739 740 /* 741 * Actually remove the range of pages. 742 * 743 * Sadly, this cannot be done using pointer iteration: 744 * vm_physseg is not guaranteed to be sorted on address, hence 745 * uvm_page_init() may not have initialized its array sorted by 746 * page number. 747 */ 748 for (iter = start; iter != end; iter = iter_end) { 749 iter_end = TAILQ_NEXT(iter, pageq); 750 TAILQ_REMOVE(pgl, iter, pageq); 751 } 752 753 lowest->fpgsz = count; 754 (void) uvm_pmr_insert(pmr, lowest, 0); 755 756 *pstart = VM_PAGE_TO_PHYS(lowest); 757 return count; 758 } 759 760 /* 761 * Extract a number of pages from a segment of free pages. 762 * Called by uvm_pmr_getpages. 763 * 764 * Returns the segment that was created from pages left over at the tail 765 * of the remove set of pages, or NULL if no pages were left at the tail. 766 */ 767 struct vm_page * 768 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg, 769 paddr_t start, paddr_t end, struct pglist *result) 770 { 771 struct vm_page *after, *pg_i; 772 psize_t before_sz, after_sz; 773 #ifdef DEBUG 774 psize_t i; 775 #endif 776 777 KDASSERT(end > start); 778 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg))); 779 KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz); 780 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start); 781 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end); 782 783 before_sz = start - atop(VM_PAGE_TO_PHYS(pg)); 784 after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end; 785 KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz); 786 uvm_pmr_assertvalid(pmr); 787 788 uvm_pmr_remove_size(pmr, pg); 789 if (before_sz == 0) 790 uvm_pmr_remove_addr(pmr, pg); 791 after = pg + before_sz + (end - start); 792 793 /* Add selected pages to result. */ 794 for (pg_i = pg + before_sz; pg_i != after; pg_i++) { 795 KASSERT(pg_i->pg_flags & PQ_FREE); 796 pg_i->fpgsz = 0; 797 TAILQ_INSERT_TAIL(result, pg_i, pageq); 798 } 799 800 /* Before handling. */ 801 if (before_sz > 0) { 802 pg->fpgsz = before_sz; 803 uvm_pmr_insert_size(pmr, pg); 804 } 805 806 /* After handling. */ 807 if (after_sz > 0) { 808 #ifdef DEBUG 809 for (i = 0; i < after_sz; i++) { 810 KASSERT(!uvm_pmr_isfree(after + i)); 811 } 812 #endif 813 KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end); 814 after->fpgsz = after_sz; 815 after = uvm_pmr_insert_addr(pmr, after, 1); 816 uvm_pmr_insert_size(pmr, after); 817 } 818 819 uvm_pmr_assertvalid(pmr); 820 return (after_sz > 0 ? after : NULL); 821 } 822 823 /* 824 * Indicate to the page daemon that a nowait call failed and it should 825 * recover at least some memory in the most restricted region (assumed 826 * to be dma_constraint). 827 */ 828 extern volatile int uvm_nowait_failed; 829 830 /* 831 * Acquire a number of pages. 832 * 833 * count: the number of pages returned 834 * start: lowest page number 835 * end: highest page number +1 836 * (start = end = 0: no limitation) 837 * align: power-of-2 alignment constraint (align = 1: no alignment) 838 * boundary: power-of-2 boundary (boundary = 0: no boundary) 839 * maxseg: maximum number of segments to return 840 * flags: UVM_PLA_* flags 841 * result: returned pages storage (uses pageq) 842 */ 843 int 844 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align, 845 paddr_t boundary, int maxseg, int flags, struct pglist *result) 846 { 847 struct uvm_pmemrange *pmr; /* Iterate memory ranges. */ 848 struct vm_page *found, *f_next; /* Iterate chunks. */ 849 psize_t fcount; /* Current found pages. */ 850 int fnsegs; /* Current segment counter. */ 851 int try, start_try; 852 psize_t search[3]; 853 paddr_t fstart, fend; /* Pages to be taken from found. */ 854 int memtype; /* Requested memtype. */ 855 int memtype_init; /* Best memtype. */ 856 int desperate; /* True if allocation failed. */ 857 #ifdef DIAGNOSTIC 858 struct vm_page *diag_prev; /* Used during validation. */ 859 #endif /* DIAGNOSTIC */ 860 861 /* 862 * Validate arguments. 863 */ 864 KASSERT(count > 0); 865 KASSERT(start == 0 || end == 0 || start < end); 866 KASSERT(align >= 1); 867 KASSERT(powerof2(align)); 868 KASSERT(maxseg > 0); 869 KASSERT(boundary == 0 || powerof2(boundary)); 870 KASSERT(boundary == 0 || maxseg * boundary >= count); 871 KASSERT(TAILQ_EMPTY(result)); 872 873 /* 874 * TRYCONTIG is a noop if you only want a single segment. 875 * Remove it if that's the case: otherwise it'll deny the fast 876 * allocation. 877 */ 878 if (maxseg == 1 || count == 1) 879 flags &= ~UVM_PLA_TRYCONTIG; 880 881 /* 882 * Configure search. 883 * 884 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case. 885 * search[1] is multiple segments, chosen to fulfill the search in 886 * approximately even-sized segments. 887 * This is a good trade-off between slightly reduced allocation speed 888 * and less fragmentation. 889 * search[2] is the worst case, in which all segments are evaluated. 890 * This provides the least fragmentation, but makes the search 891 * possibly longer (although in the case it is selected, that no 892 * longer matters most). 893 * 894 * The exception is when maxseg == 1: since we can only fulfill that 895 * with one segment of size pages, only a single search type has to 896 * be attempted. 897 */ 898 if (maxseg == 1 || count == 1) { 899 start_try = 2; 900 search[2] = count; 901 } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) { 902 start_try = 2; 903 search[2] = 1; 904 } else { 905 start_try = 0; 906 search[0] = count; 907 search[1] = pow2divide(count, maxseg); 908 search[2] = 1; 909 if ((flags & UVM_PLA_TRYCONTIG) == 0) 910 start_try = 1; 911 if (search[1] >= search[0]) { 912 search[1] = search[0]; 913 start_try = 1; 914 } 915 if (search[2] >= search[start_try]) { 916 start_try = 2; 917 } 918 } 919 920 /* 921 * Memory type: if zeroed memory is requested, traverse the zero set. 922 * Otherwise, traverse the dirty set. 923 * 924 * The memtype iterator is reinitialized to memtype_init on entrance 925 * of a pmemrange. 926 */ 927 if (flags & UVM_PLA_ZERO) 928 memtype_init = UVM_PMR_MEMTYPE_ZERO; 929 else 930 memtype_init = UVM_PMR_MEMTYPE_DIRTY; 931 932 /* 933 * Initially, we're not desperate. 934 * 935 * Note that if we return from a sleep, we are still desperate. 936 * Chances are that memory pressure is still high, so resetting 937 * seems over-optimistic to me. 938 */ 939 desperate = 0; 940 941 uvm_lock_fpageq(); 942 943 retry: /* Return point after sleeping. */ 944 fcount = 0; 945 fnsegs = 0; 946 947 retry_desperate: 948 /* 949 * If we just want any page(s), go for the really fast option. 950 */ 951 if (count <= maxseg && align == 1 && boundary == 0 && 952 (flags & UVM_PLA_TRYCONTIG) == 0) { 953 fcount += uvm_pmr_get1page(count - fcount, memtype_init, 954 result, start, end, 0); 955 956 /* 957 * If we found sufficient pages, go to the succes exit code. 958 * 959 * Otherwise, go immediately to fail, since we collected 960 * all we could anyway. 961 */ 962 if (fcount == count) 963 goto out; 964 else 965 goto fail; 966 } 967 968 /* 969 * The heart of the contig case. 970 * 971 * The code actually looks like this: 972 * 973 * foreach (struct pmemrange) { 974 * foreach (memtype) { 975 * foreach(try) { 976 * foreach (free range of memtype in pmemrange, 977 * starting at search[try]) { 978 * while (range has space left) 979 * take from range 980 * } 981 * } 982 * } 983 * 984 * if next pmemrange has higher usecount than current: 985 * enter desperate case (which will drain the pmemranges 986 * until empty prior to moving to the next one) 987 * } 988 * 989 * When desperate is activated, try always starts at the highest 990 * value. The memtype loop is using a goto ReScanMemtype. 991 * The try loop is using a goto ReScan. 992 * The 'range has space left' loop uses label DrainFound. 993 * 994 * Writing them all as loops would take up a lot of screen space in 995 * the form of indentation and some parts are easier to express 996 * using the labels. 997 */ 998 999 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1000 /* Empty range. */ 1001 if (pmr->nsegs == 0) 1002 continue; 1003 1004 /* Outside requested range. */ 1005 if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 1006 continue; 1007 1008 memtype = memtype_init; 1009 1010 rescan_memtype: /* Return point at memtype++. */ 1011 try = start_try; 1012 1013 rescan: /* Return point at try++. */ 1014 for (found = uvm_pmr_nfindsz(pmr, search[try], memtype); 1015 found != NULL; 1016 found = f_next) { 1017 f_next = uvm_pmr_nextsz(pmr, found, memtype); 1018 1019 fstart = atop(VM_PAGE_TO_PHYS(found)); 1020 if (start != 0) 1021 fstart = MAX(start, fstart); 1022 drain_found: 1023 /* 1024 * Throw away the first segment if fnsegs == maxseg 1025 * 1026 * Note that f_next is still valid after this call, 1027 * since we only allocated from entries before f_next. 1028 * We don't revisit the entries we already extracted 1029 * from unless we entered the desperate case. 1030 */ 1031 if (fnsegs == maxseg) { 1032 fnsegs--; 1033 fcount -= 1034 uvm_pmr_remove_1strange(result, boundary, 1035 &found, desperate); 1036 } 1037 1038 fstart = PMR_ALIGN(fstart, align); 1039 fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz; 1040 if (end != 0) 1041 fend = MIN(end, fend); 1042 if (boundary != 0) { 1043 fend = 1044 MIN(fend, PMR_ALIGN(fstart + 1, boundary)); 1045 } 1046 if (fstart >= fend) 1047 continue; 1048 if (fend - fstart > count - fcount) 1049 fend = fstart + (count - fcount); 1050 1051 fcount += fend - fstart; 1052 fnsegs++; 1053 found = uvm_pmr_extract_range(pmr, found, 1054 fstart, fend, result); 1055 1056 if (fcount == count) 1057 goto out; 1058 1059 /* 1060 * If there's still space left in found, try to 1061 * fully drain it prior to continueing. 1062 */ 1063 if (found != NULL) { 1064 fstart = fend; 1065 goto drain_found; 1066 } 1067 } 1068 1069 /* Try a smaller search now. */ 1070 if (++try < nitems(search)) 1071 goto rescan; 1072 1073 /* 1074 * Exhaust all memory types prior to going to the next memory 1075 * segment. 1076 * This means that zero-vs-dirty are eaten prior to moving 1077 * to a pmemrange with a higher use-count. 1078 * 1079 * Code is basically a difficult way of writing: 1080 * memtype = memtype_init; 1081 * do { 1082 * ...; 1083 * memtype += 1; 1084 * memtype %= MEMTYPE_MAX; 1085 * } while (memtype != memtype_init); 1086 */ 1087 memtype += 1; 1088 if (memtype == UVM_PMR_MEMTYPE_MAX) 1089 memtype = 0; 1090 if (memtype != memtype_init) 1091 goto rescan_memtype; 1092 1093 /* 1094 * If not desperate, enter desperate case prior to eating all 1095 * the good stuff in the next range. 1096 */ 1097 if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL && 1098 TAILQ_NEXT(pmr, pmr_use)->use != pmr->use) 1099 break; 1100 } 1101 1102 /* 1103 * Not enough memory of the requested type available. Fall back to 1104 * less good memory that we'll clean up better later. 1105 * 1106 * This algorithm is not very smart though, it just starts scanning 1107 * a different typed range, but the nicer ranges of the previous 1108 * iteration may fall out. Hence there is a small chance of a false 1109 * negative. 1110 * 1111 * When desparate: scan all sizes starting at the smallest 1112 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may 1113 * allow us to hit the fast path now). 1114 * 1115 * Also, because we will revisit entries we scanned before, we need 1116 * to reset the page queue, or we may end up releasing entries in 1117 * such a way as to invalidate f_next. 1118 */ 1119 if (!desperate) { 1120 desperate = 1; 1121 start_try = nitems(search) - 1; 1122 flags &= ~UVM_PLA_TRYCONTIG; 1123 1124 while (!TAILQ_EMPTY(result)) 1125 uvm_pmr_remove_1strange(result, 0, NULL, 0); 1126 fnsegs = 0; 1127 fcount = 0; 1128 goto retry_desperate; 1129 } 1130 1131 fail: 1132 /* Allocation failed. */ 1133 /* XXX: claim from memory reserve here */ 1134 1135 while (!TAILQ_EMPTY(result)) 1136 uvm_pmr_remove_1strange(result, 0, NULL, 0); 1137 1138 if (flags & UVM_PLA_WAITOK) { 1139 if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count), 1140 flags & UVM_PLA_FAILOK) == 0) 1141 goto retry; 1142 KASSERT(flags & UVM_PLA_FAILOK); 1143 } else { 1144 if (!(flags & UVM_PLA_NOWAKE)) { 1145 uvm_nowait_failed = 1; 1146 wakeup(&uvm.pagedaemon); 1147 } 1148 } 1149 uvm_unlock_fpageq(); 1150 1151 return ENOMEM; 1152 1153 out: 1154 /* Allocation succesful. */ 1155 uvmexp.free -= fcount; 1156 1157 uvm_unlock_fpageq(); 1158 1159 /* Update statistics and zero pages if UVM_PLA_ZERO. */ 1160 #ifdef DIAGNOSTIC 1161 fnsegs = 0; 1162 fcount = 0; 1163 diag_prev = NULL; 1164 #endif /* DIAGNOSTIC */ 1165 TAILQ_FOREACH(found, result, pageq) { 1166 atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK); 1167 1168 if (found->pg_flags & PG_ZERO) { 1169 uvm_lock_fpageq(); 1170 uvmexp.zeropages--; 1171 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1172 wakeup(&uvmexp.zeropages); 1173 uvm_unlock_fpageq(); 1174 } 1175 if (flags & UVM_PLA_ZERO) { 1176 if (found->pg_flags & PG_ZERO) 1177 uvmexp.pga_zerohit++; 1178 else { 1179 uvmexp.pga_zeromiss++; 1180 uvm_pagezero(found); 1181 } 1182 } 1183 atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE); 1184 1185 found->uobject = NULL; 1186 found->uanon = NULL; 1187 found->pg_version++; 1188 1189 /* 1190 * Validate that the page matches range criterium. 1191 */ 1192 KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start); 1193 KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end); 1194 1195 #ifdef DIAGNOSTIC 1196 /* 1197 * Update fcount (# found pages) and 1198 * fnsegs (# found segments) counters. 1199 */ 1200 if (diag_prev == NULL || 1201 /* new segment if it contains a hole */ 1202 atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 != 1203 atop(VM_PAGE_TO_PHYS(found)) || 1204 /* new segment if it crosses boundary */ 1205 (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) != 1206 (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1))) 1207 fnsegs++; 1208 fcount++; 1209 1210 diag_prev = found; 1211 #endif /* DIAGNOSTIC */ 1212 } 1213 1214 #ifdef DIAGNOSTIC 1215 /* 1216 * Panic on algorithm failure. 1217 */ 1218 if (fcount != count || fnsegs > maxseg) { 1219 panic("pmemrange allocation error: " 1220 "allocated %ld pages in %d segments, " 1221 "but request was %ld pages in %d segments", 1222 fcount, fnsegs, count, maxseg); 1223 } 1224 #endif /* DIAGNOSTIC */ 1225 1226 return 0; 1227 } 1228 1229 /* 1230 * Free a number of contig pages (invoked by uvm_page_init). 1231 */ 1232 void 1233 uvm_pmr_freepages(struct vm_page *pg, psize_t count) 1234 { 1235 struct uvm_pmemrange *pmr; 1236 psize_t i, pmr_count; 1237 struct vm_page *firstpg = pg; 1238 1239 for (i = 0; i < count; i++) { 1240 KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) == 1241 atop(VM_PAGE_TO_PHYS(pg)) + i); 1242 1243 if (!((pg[i].pg_flags & PQ_FREE) == 0 && 1244 VALID_FLAGS(pg[i].pg_flags))) { 1245 printf("Flags: 0x%x, will panic now.\n", 1246 pg[i].pg_flags); 1247 } 1248 KASSERT((pg[i].pg_flags & PQ_FREE) == 0 && 1249 VALID_FLAGS(pg[i].pg_flags)); 1250 atomic_setbits_int(&pg[i].pg_flags, PQ_FREE); 1251 atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO); 1252 } 1253 1254 uvm_lock_fpageq(); 1255 1256 for (i = count; i > 0; i -= pmr_count) { 1257 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1258 KASSERT(pmr != NULL); 1259 1260 pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg))); 1261 pg->fpgsz = pmr_count; 1262 uvm_pmr_insert(pmr, pg, 0); 1263 1264 uvmexp.free += pmr_count; 1265 pg += pmr_count; 1266 } 1267 wakeup(&uvmexp.free); 1268 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1269 wakeup(&uvmexp.zeropages); 1270 1271 uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count)); 1272 1273 uvm_unlock_fpageq(); 1274 } 1275 1276 /* 1277 * Free all pages in the queue. 1278 */ 1279 void 1280 uvm_pmr_freepageq(struct pglist *pgl) 1281 { 1282 struct vm_page *pg; 1283 paddr_t pstart; 1284 psize_t plen; 1285 1286 TAILQ_FOREACH(pg, pgl, pageq) { 1287 if (!((pg->pg_flags & PQ_FREE) == 0 && 1288 VALID_FLAGS(pg->pg_flags))) { 1289 printf("Flags: 0x%x, will panic now.\n", 1290 pg->pg_flags); 1291 } 1292 KASSERT((pg->pg_flags & PQ_FREE) == 0 && 1293 VALID_FLAGS(pg->pg_flags)); 1294 atomic_setbits_int(&pg->pg_flags, PQ_FREE); 1295 atomic_clearbits_int(&pg->pg_flags, PG_ZERO); 1296 } 1297 1298 uvm_lock_fpageq(); 1299 while (!TAILQ_EMPTY(pgl)) { 1300 pg = TAILQ_FIRST(pgl); 1301 if (pg == TAILQ_NEXT(pg, pageq) + 1) { 1302 /* 1303 * If pg is one behind the position of the 1304 * next page in the list in the page array, 1305 * try going backwards instead of forward. 1306 */ 1307 plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart); 1308 } else { 1309 pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl)); 1310 plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0); 1311 } 1312 uvmexp.free += plen; 1313 1314 uvm_wakeup_pla(pstart, ptoa(plen)); 1315 } 1316 wakeup(&uvmexp.free); 1317 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1318 wakeup(&uvmexp.zeropages); 1319 uvm_unlock_fpageq(); 1320 1321 return; 1322 } 1323 1324 /* 1325 * Store a pmemrange in the list. 1326 * 1327 * The list is sorted by use. 1328 */ 1329 struct uvm_pmemrange * 1330 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq, 1331 struct uvm_pmemrange *pmr) 1332 { 1333 struct uvm_pmemrange *iter; 1334 int cmp = 1; 1335 1336 TAILQ_FOREACH(iter, useq, pmr_use) { 1337 cmp = uvm_pmemrange_use_cmp(pmr, iter); 1338 if (cmp == 0) 1339 return iter; 1340 if (cmp == -1) 1341 break; 1342 } 1343 1344 if (iter == NULL) 1345 TAILQ_INSERT_TAIL(useq, pmr, pmr_use); 1346 else 1347 TAILQ_INSERT_BEFORE(iter, pmr, pmr_use); 1348 return NULL; 1349 } 1350 1351 #ifdef DEBUG 1352 /* 1353 * Validation of the whole pmemrange. 1354 * Called with fpageq locked. 1355 */ 1356 void 1357 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr) 1358 { 1359 struct vm_page *prev, *next, *i, *xref; 1360 int lcv, mti; 1361 1362 /* Empty range */ 1363 if (pmr->nsegs == 0) 1364 return; 1365 1366 /* Validate address tree. */ 1367 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) { 1368 /* Validate the range. */ 1369 KASSERT(i->fpgsz > 0); 1370 KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low); 1371 KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz 1372 <= pmr->high); 1373 1374 /* Validate each page in this range. */ 1375 for (lcv = 0; lcv < i->fpgsz; lcv++) { 1376 /* 1377 * Only the first page has a size specification. 1378 * Rest is size 0. 1379 */ 1380 KASSERT(lcv == 0 || i[lcv].fpgsz == 0); 1381 /* 1382 * Flag check. 1383 */ 1384 KASSERT(VALID_FLAGS(i[lcv].pg_flags) && 1385 (i[lcv].pg_flags & PQ_FREE) == PQ_FREE); 1386 /* 1387 * Free pages are: 1388 * - not wired 1389 * - have no vm_anon 1390 * - have no uvm_object 1391 */ 1392 KASSERT(i[lcv].wire_count == 0); 1393 KASSERT(i[lcv].uanon == (void*)0xdeadbeef || 1394 i[lcv].uanon == NULL); 1395 KASSERT(i[lcv].uobject == (void*)0xdeadbeef || 1396 i[lcv].uobject == NULL); 1397 /* 1398 * Pages in a single range always have the same 1399 * memtype. 1400 */ 1401 KASSERT(uvm_pmr_pg_to_memtype(&i[0]) == 1402 uvm_pmr_pg_to_memtype(&i[lcv])); 1403 } 1404 1405 /* Check that it shouldn't be joined with its predecessor. */ 1406 prev = RBT_PREV(uvm_pmr_addr, i); 1407 if (prev != NULL) { 1408 KASSERT(uvm_pmr_pg_to_memtype(i) != 1409 uvm_pmr_pg_to_memtype(prev) || 1410 atop(VM_PAGE_TO_PHYS(i)) > 1411 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz || 1412 prev + prev->fpgsz != i); 1413 } 1414 1415 /* Assert i is in the size tree as well. */ 1416 if (i->fpgsz == 1) { 1417 TAILQ_FOREACH(xref, 1418 &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) { 1419 if (xref == i) 1420 break; 1421 } 1422 KASSERT(xref == i); 1423 } else { 1424 KASSERT(RBT_FIND(uvm_pmr_size, 1425 &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) == 1426 i + 1); 1427 } 1428 } 1429 1430 /* Validate size tree. */ 1431 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 1432 for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) { 1433 next = uvm_pmr_nextsz(pmr, i, mti); 1434 if (next != NULL) { 1435 KASSERT(i->fpgsz <= 1436 next->fpgsz); 1437 } 1438 1439 /* Assert i is in the addr tree as well. */ 1440 KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i); 1441 1442 /* Assert i is of the correct memory type. */ 1443 KASSERT(uvm_pmr_pg_to_memtype(i) == mti); 1444 } 1445 } 1446 1447 /* Validate nsegs statistic. */ 1448 lcv = 0; 1449 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) 1450 lcv++; 1451 KASSERT(pmr->nsegs == lcv); 1452 } 1453 #endif /* DEBUG */ 1454 1455 /* 1456 * Split pmr at split point pageno. 1457 * Called with fpageq unlocked. 1458 * 1459 * Split is only applied if a pmemrange spans pageno. 1460 */ 1461 void 1462 uvm_pmr_split(paddr_t pageno) 1463 { 1464 struct uvm_pmemrange *pmr, *drain; 1465 struct vm_page *rebuild, *prev, *next; 1466 psize_t prev_sz; 1467 1468 uvm_lock_fpageq(); 1469 pmr = uvm_pmemrange_find(pageno); 1470 if (pmr == NULL || !(pmr->low < pageno)) { 1471 /* No split required. */ 1472 uvm_unlock_fpageq(); 1473 return; 1474 } 1475 1476 KASSERT(pmr->low < pageno); 1477 KASSERT(pmr->high > pageno); 1478 1479 /* 1480 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into 1481 * uvm_kmemalloc which calls into pmemrange, making the locking 1482 * a bit hard, so we just race! 1483 */ 1484 uvm_unlock_fpageq(); 1485 drain = uvm_pmr_allocpmr(); 1486 uvm_lock_fpageq(); 1487 pmr = uvm_pmemrange_find(pageno); 1488 if (pmr == NULL || !(pmr->low < pageno)) { 1489 /* 1490 * We lost the race since someone else ran this or a related 1491 * function, however this should be triggered very rarely so 1492 * we just leak the pmr. 1493 */ 1494 printf("uvm_pmr_split: lost one pmr\n"); 1495 uvm_unlock_fpageq(); 1496 return; 1497 } 1498 1499 drain->low = pageno; 1500 drain->high = pmr->high; 1501 drain->use = pmr->use; 1502 1503 uvm_pmr_assertvalid(pmr); 1504 uvm_pmr_assertvalid(drain); 1505 KASSERT(drain->nsegs == 0); 1506 1507 RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) { 1508 if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno) 1509 break; 1510 } 1511 if (rebuild == NULL) 1512 prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 1513 else 1514 prev = RBT_PREV(uvm_pmr_addr, rebuild); 1515 KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1516 1517 /* 1518 * Handle free chunk that spans the split point. 1519 */ 1520 if (prev != NULL && 1521 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) { 1522 psize_t before, after; 1523 1524 KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1525 1526 uvm_pmr_remove(pmr, prev); 1527 prev_sz = prev->fpgsz; 1528 before = pageno - atop(VM_PAGE_TO_PHYS(prev)); 1529 after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno; 1530 1531 KASSERT(before > 0); 1532 KASSERT(after > 0); 1533 1534 prev->fpgsz = before; 1535 uvm_pmr_insert(pmr, prev, 1); 1536 (prev + before)->fpgsz = after; 1537 uvm_pmr_insert(drain, prev + before, 1); 1538 } 1539 1540 /* Move free chunks that no longer fall in the range. */ 1541 for (; rebuild != NULL; rebuild = next) { 1542 next = RBT_NEXT(uvm_pmr_addr, rebuild); 1543 1544 uvm_pmr_remove(pmr, rebuild); 1545 uvm_pmr_insert(drain, rebuild, 1); 1546 } 1547 1548 pmr->high = pageno; 1549 uvm_pmr_assertvalid(pmr); 1550 uvm_pmr_assertvalid(drain); 1551 1552 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain); 1553 uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain); 1554 uvm_unlock_fpageq(); 1555 } 1556 1557 /* 1558 * Increase the usage counter for the given range of memory. 1559 * 1560 * The more usage counters a given range of memory has, the more will be 1561 * attempted not to allocate from it. 1562 * 1563 * Addresses here are in paddr_t, not page-numbers. 1564 * The lowest and highest allowed address are specified. 1565 */ 1566 void 1567 uvm_pmr_use_inc(paddr_t low, paddr_t high) 1568 { 1569 struct uvm_pmemrange *pmr; 1570 paddr_t sz; 1571 1572 /* pmr uses page numbers, translate low and high. */ 1573 high++; 1574 high = atop(trunc_page(high)); 1575 low = atop(round_page(low)); 1576 uvm_pmr_split(low); 1577 uvm_pmr_split(high); 1578 1579 sz = 0; 1580 uvm_lock_fpageq(); 1581 /* Increase use count on segments in range. */ 1582 RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) { 1583 if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) { 1584 TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use); 1585 pmr->use++; 1586 sz += pmr->high - pmr->low; 1587 uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr); 1588 } 1589 uvm_pmr_assertvalid(pmr); 1590 } 1591 uvm_unlock_fpageq(); 1592 1593 KASSERT(sz >= high - low); 1594 } 1595 1596 /* 1597 * Allocate a pmemrange. 1598 * 1599 * If called from uvm_page_init, the uvm_pageboot_alloc is used. 1600 * If called after uvm_init, malloc is used. 1601 * (And if called in between, you're dead.) 1602 */ 1603 struct uvm_pmemrange * 1604 uvm_pmr_allocpmr(void) 1605 { 1606 struct uvm_pmemrange *nw; 1607 int i; 1608 1609 /* We're only ever hitting the !uvm.page_init_done case for now. */ 1610 if (!uvm.page_init_done) { 1611 nw = (struct uvm_pmemrange *) 1612 uvm_pageboot_alloc(sizeof(struct uvm_pmemrange)); 1613 } else { 1614 nw = malloc(sizeof(struct uvm_pmemrange), 1615 M_VMMAP, M_NOWAIT); 1616 } 1617 KASSERT(nw != NULL); 1618 memset(nw, 0, sizeof(struct uvm_pmemrange)); 1619 RBT_INIT(uvm_pmr_addr, &nw->addr); 1620 for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) { 1621 RBT_INIT(uvm_pmr_size, &nw->size[i]); 1622 TAILQ_INIT(&nw->single[i]); 1623 } 1624 return nw; 1625 } 1626 1627 /* 1628 * Initialization of pmr. 1629 * Called by uvm_page_init. 1630 * 1631 * Sets up pmemranges. 1632 */ 1633 void 1634 uvm_pmr_init(void) 1635 { 1636 struct uvm_pmemrange *new_pmr; 1637 int i; 1638 1639 TAILQ_INIT(&uvm.pmr_control.use); 1640 RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 1641 TAILQ_INIT(&uvm.pmr_control.allocs); 1642 1643 /* By default, one range for the entire address space. */ 1644 new_pmr = uvm_pmr_allocpmr(); 1645 new_pmr->low = 0; 1646 new_pmr->high = atop((paddr_t)-1) + 1; 1647 1648 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr); 1649 uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr); 1650 1651 for (i = 0; uvm_md_constraints[i] != NULL; i++) { 1652 uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low, 1653 uvm_md_constraints[i]->ucr_high); 1654 } 1655 } 1656 1657 /* 1658 * Find the pmemrange that contains the given page number. 1659 * 1660 * (Manually traverses the binary tree, because that is cheaper on stack 1661 * usage.) 1662 */ 1663 struct uvm_pmemrange * 1664 uvm_pmemrange_find(paddr_t pageno) 1665 { 1666 struct uvm_pmemrange *pmr; 1667 1668 pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 1669 while (pmr != NULL) { 1670 if (pmr->low > pageno) 1671 pmr = RBT_LEFT(uvm_pmemrange_addr, pmr); 1672 else if (pmr->high <= pageno) 1673 pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr); 1674 else 1675 break; 1676 } 1677 1678 return pmr; 1679 } 1680 1681 #if defined(DDB) || defined(DEBUG) 1682 /* 1683 * Return true if the given page is in any of the free lists. 1684 * Used by uvm_page_printit. 1685 * This function is safe, even if the page is not on the freeq. 1686 * Note: does not apply locking, only called from ddb. 1687 */ 1688 int 1689 uvm_pmr_isfree(struct vm_page *pg) 1690 { 1691 struct vm_page *r; 1692 struct uvm_pmemrange *pmr; 1693 1694 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1695 if (pmr == NULL) 1696 return 0; 1697 r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 1698 if (r == NULL) 1699 r = RBT_MAX(uvm_pmr_addr, &pmr->addr); 1700 else if (r != pg) 1701 r = RBT_PREV(uvm_pmr_addr, r); 1702 if (r == NULL) 1703 return 0; /* Empty tree. */ 1704 1705 KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg))); 1706 return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz > 1707 atop(VM_PAGE_TO_PHYS(pg)); 1708 } 1709 #endif /* DEBUG */ 1710 1711 /* 1712 * Given a root of a tree, find a range which intersects start, end and 1713 * is of the same memtype. 1714 * 1715 * Page must be in the address tree. 1716 */ 1717 struct vm_page* 1718 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root, 1719 paddr_t start, paddr_t end, int memtype) 1720 { 1721 int direction; 1722 struct vm_page *root; 1723 struct vm_page *high, *high_next; 1724 struct vm_page *low, *low_next; 1725 1726 KDASSERT(pmr != NULL && init_root != NULL); 1727 root = init_root; 1728 1729 /* Which direction to use for searching. */ 1730 if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start) 1731 direction = 1; 1732 else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end) 1733 direction = -1; 1734 else /* nothing to do */ 1735 return root; 1736 1737 /* First, update root to fall within the chosen range. */ 1738 while (root && !PMR_INTERSECTS_WITH( 1739 atop(VM_PAGE_TO_PHYS(root)), 1740 atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz, 1741 start, end)) { 1742 if (direction == 1) 1743 root = RBT_RIGHT(uvm_objtree, root); 1744 else 1745 root = RBT_LEFT(uvm_objtree, root); 1746 } 1747 if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype) 1748 return root; 1749 1750 /* 1751 * Root is valid, but of the wrong memtype. 1752 * 1753 * Try to find a range that has the given memtype in the subtree 1754 * (memtype mismatches are costly, either because the conversion 1755 * is expensive, or a later allocation will need to do the opposite 1756 * conversion, which will be expensive). 1757 * 1758 * 1759 * First, simply increase address until we hit something we can use. 1760 * Cache the upper page, so we can page-walk later. 1761 */ 1762 high = root; 1763 high_next = RBT_RIGHT(uvm_objtree, high); 1764 while (high_next != NULL && PMR_INTERSECTS_WITH( 1765 atop(VM_PAGE_TO_PHYS(high_next)), 1766 atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz, 1767 start, end)) { 1768 high = high_next; 1769 if (uvm_pmr_pg_to_memtype(high) == memtype) 1770 return high; 1771 high_next = RBT_RIGHT(uvm_objtree, high); 1772 } 1773 1774 /* 1775 * Second, decrease the address until we hit something we can use. 1776 * Cache the lower page, so we can page-walk later. 1777 */ 1778 low = root; 1779 low_next = RBT_LEFT(uvm_objtree, low); 1780 while (low_next != NULL && PMR_INTERSECTS_WITH( 1781 atop(VM_PAGE_TO_PHYS(low_next)), 1782 atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz, 1783 start, end)) { 1784 low = low_next; 1785 if (uvm_pmr_pg_to_memtype(low) == memtype) 1786 return low; 1787 low_next = RBT_LEFT(uvm_objtree, low); 1788 } 1789 1790 if (low == high) 1791 return NULL; 1792 1793 /* No hits. Walk the address tree until we find something usable. */ 1794 for (low = RBT_NEXT(uvm_pmr_addr, low); 1795 low != high; 1796 low = RBT_NEXT(uvm_pmr_addr, low)) { 1797 KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)), 1798 atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz, 1799 start, end)); 1800 if (uvm_pmr_pg_to_memtype(low) == memtype) 1801 return low; 1802 } 1803 1804 /* Nothing found. */ 1805 return NULL; 1806 } 1807 1808 /* 1809 * Allocate any page, the fastest way. Page number constraints only. 1810 */ 1811 psize_t 1812 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result, 1813 paddr_t start, paddr_t end, int memtype_only) 1814 { 1815 struct uvm_pmemrange *pmr; 1816 struct vm_page *found, *splitpg; 1817 psize_t fcount; 1818 int memtype; 1819 1820 fcount = 0; 1821 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1822 /* We're done. */ 1823 if (fcount == count) 1824 break; 1825 1826 /* Outside requested range. */ 1827 if (!(start == 0 && end == 0) && 1828 !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 1829 continue; 1830 1831 /* Range is empty. */ 1832 if (pmr->nsegs == 0) 1833 continue; 1834 1835 /* Loop over all memtypes, starting at memtype_init. */ 1836 memtype = memtype_init; 1837 while (fcount != count) { 1838 found = TAILQ_FIRST(&pmr->single[memtype]); 1839 /* 1840 * If found is outside the range, walk the list 1841 * until we find something that intersects with 1842 * boundaries. 1843 */ 1844 while (found && !PMR_INTERSECTS_WITH( 1845 atop(VM_PAGE_TO_PHYS(found)), 1846 atop(VM_PAGE_TO_PHYS(found)) + 1, 1847 start, end)) 1848 found = TAILQ_NEXT(found, pageq); 1849 1850 if (found == NULL) { 1851 /* 1852 * Check if the size tree contains a range 1853 * that intersects with the boundaries. As the 1854 * allocation is for any page, try the smallest 1855 * range so that large ranges are preserved for 1856 * more constrained cases. Only one entry is 1857 * checked here, to avoid a brute-force search. 1858 * 1859 * Note that a size tree gives pg[1] instead of 1860 * pg[0]. 1861 */ 1862 found = RBT_MIN(uvm_pmr_size, 1863 &pmr->size[memtype]); 1864 if (found != NULL) { 1865 found--; 1866 if (!PMR_INTERSECTS_WITH( 1867 atop(VM_PAGE_TO_PHYS(found)), 1868 atop(VM_PAGE_TO_PHYS(found)) + 1869 found->fpgsz, start, end)) 1870 found = NULL; 1871 } 1872 } 1873 if (found == NULL) { 1874 /* 1875 * Try address-guided search to meet the page 1876 * number constraints. 1877 */ 1878 found = RBT_ROOT(uvm_pmr_addr, &pmr->addr); 1879 if (found != NULL) { 1880 found = uvm_pmr_rootupdate(pmr, found, 1881 start, end, memtype); 1882 } 1883 } 1884 if (found != NULL) { 1885 uvm_pmr_assertvalid(pmr); 1886 uvm_pmr_remove_size(pmr, found); 1887 1888 /* 1889 * If the page intersects the end, then it'll 1890 * need splitting. 1891 * 1892 * Note that we don't need to split if the page 1893 * intersects start: the drain function will 1894 * simply stop on hitting start. 1895 */ 1896 if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) + 1897 found->fpgsz > end) { 1898 psize_t splitsz = 1899 atop(VM_PAGE_TO_PHYS(found)) + 1900 found->fpgsz - end; 1901 1902 uvm_pmr_remove_addr(pmr, found); 1903 uvm_pmr_assertvalid(pmr); 1904 found->fpgsz -= splitsz; 1905 splitpg = found + found->fpgsz; 1906 splitpg->fpgsz = splitsz; 1907 uvm_pmr_insert(pmr, splitpg, 1); 1908 1909 /* 1910 * At this point, splitpg and found 1911 * actually should be joined. 1912 * But we explicitly disable that, 1913 * because we will start subtracting 1914 * from found. 1915 */ 1916 KASSERT(start == 0 || 1917 atop(VM_PAGE_TO_PHYS(found)) + 1918 found->fpgsz > start); 1919 uvm_pmr_insert_addr(pmr, found, 1); 1920 } 1921 1922 /* 1923 * Fetch pages from the end. 1924 * If the range is larger than the requested 1925 * number of pages, this saves us an addr-tree 1926 * update. 1927 * 1928 * Since we take from the end and insert at 1929 * the head, any ranges keep preserved. 1930 */ 1931 while (found->fpgsz > 0 && fcount < count && 1932 (start == 0 || 1933 atop(VM_PAGE_TO_PHYS(found)) + 1934 found->fpgsz > start)) { 1935 found->fpgsz--; 1936 fcount++; 1937 TAILQ_INSERT_HEAD(result, 1938 &found[found->fpgsz], pageq); 1939 } 1940 if (found->fpgsz > 0) { 1941 uvm_pmr_insert_size(pmr, found); 1942 KDASSERT(fcount == count); 1943 uvm_pmr_assertvalid(pmr); 1944 return fcount; 1945 } 1946 1947 /* 1948 * Delayed addr-tree removal. 1949 */ 1950 uvm_pmr_remove_addr(pmr, found); 1951 uvm_pmr_assertvalid(pmr); 1952 } else { 1953 if (memtype_only) 1954 break; 1955 /* 1956 * Skip to the next memtype. 1957 */ 1958 memtype += 1; 1959 if (memtype == UVM_PMR_MEMTYPE_MAX) 1960 memtype = 0; 1961 if (memtype == memtype_init) 1962 break; 1963 } 1964 } 1965 } 1966 1967 /* 1968 * Search finished. 1969 * 1970 * Ran out of ranges before enough pages were gathered, or we hit the 1971 * case where found->fpgsz == count - fcount, in which case the 1972 * above exit condition didn't trigger. 1973 * 1974 * On failure, caller will free the pages. 1975 */ 1976 return fcount; 1977 } 1978 1979 #ifdef DDB 1980 /* 1981 * Print information about pmemrange. 1982 * Does not do locking (so either call it from DDB or acquire fpageq lock 1983 * before invoking. 1984 */ 1985 void 1986 uvm_pmr_print(void) 1987 { 1988 struct uvm_pmemrange *pmr; 1989 struct vm_page *pg; 1990 psize_t size[UVM_PMR_MEMTYPE_MAX]; 1991 psize_t free; 1992 int useq_len; 1993 int mt; 1994 1995 printf("Ranges, use queue:\n"); 1996 useq_len = 0; 1997 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1998 useq_len++; 1999 free = 0; 2000 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 2001 pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]); 2002 if (pg != NULL) 2003 pg--; 2004 else 2005 pg = TAILQ_FIRST(&pmr->single[mt]); 2006 size[mt] = (pg == NULL ? 0 : pg->fpgsz); 2007 2008 RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr) 2009 free += pg->fpgsz; 2010 } 2011 2012 printf("* [0x%lx-0x%lx] use=%d nsegs=%ld", 2013 (unsigned long)pmr->low, (unsigned long)pmr->high, 2014 pmr->use, (unsigned long)pmr->nsegs); 2015 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 2016 printf(" maxsegsz[%d]=0x%lx", mt, 2017 (unsigned long)size[mt]); 2018 } 2019 printf(" free=0x%lx\n", (unsigned long)free); 2020 } 2021 printf("#ranges = %d\n", useq_len); 2022 } 2023 #endif 2024 2025 /* 2026 * uvm_wait_pla: wait (sleep) for the page daemon to free some pages 2027 * in a specific physmem area. 2028 * 2029 * Returns ENOMEM if the pagedaemon failed to free any pages. 2030 * If not failok, failure will lead to panic. 2031 * 2032 * Must be called with fpageq locked. 2033 */ 2034 int 2035 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok) 2036 { 2037 struct uvm_pmalloc pma; 2038 const char *wmsg = "pmrwait"; 2039 2040 if (curproc == uvm.pagedaemon_proc) { 2041 /* 2042 * This is not that uncommon when the pagedaemon is trying 2043 * to flush out a large mmapped file. VOP_WRITE will circle 2044 * back through the buffer cache and try to get more memory. 2045 * The pagedaemon starts by calling bufbackoff, but we can 2046 * easily use up that reserve in a single scan iteration. 2047 */ 2048 uvm_unlock_fpageq(); 2049 if (bufbackoff(NULL, atop(size)) == 0) { 2050 uvm_lock_fpageq(); 2051 return 0; 2052 } 2053 uvm_lock_fpageq(); 2054 2055 /* 2056 * XXX detect pagedaemon deadlock - see comment in 2057 * uvm_wait(), as this is exactly the same issue. 2058 */ 2059 printf("pagedaemon: wait_pla deadlock detected!\n"); 2060 msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, 2061 MSEC_TO_NSEC(125)); 2062 #if defined(DEBUG) 2063 /* DEBUG: panic so we can debug it */ 2064 panic("wait_pla pagedaemon deadlock"); 2065 #endif 2066 return 0; 2067 } 2068 2069 for (;;) { 2070 pma.pm_constraint.ucr_low = low; 2071 pma.pm_constraint.ucr_high = high; 2072 pma.pm_size = size; 2073 pma.pm_flags = UVM_PMA_LINKED; 2074 TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq); 2075 2076 wakeup(&uvm.pagedaemon); /* wake the daemon! */ 2077 while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY)) 2078 msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP); 2079 2080 if (!(pma.pm_flags & UVM_PMA_FREED) && 2081 pma.pm_flags & UVM_PMA_FAIL) { 2082 if (failok) 2083 return ENOMEM; 2084 printf("uvm_wait: failed to free %ld pages between " 2085 "0x%lx-0x%lx\n", atop(size), low, high); 2086 } else 2087 return 0; 2088 } 2089 /* UNREACHABLE */ 2090 } 2091 2092 /* 2093 * Wake up uvm_pmalloc sleepers. 2094 */ 2095 void 2096 uvm_wakeup_pla(paddr_t low, psize_t len) 2097 { 2098 struct uvm_pmalloc *pma, *pma_next; 2099 paddr_t high; 2100 2101 high = low + len; 2102 2103 /* Wake specific allocations waiting for this memory. */ 2104 for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL; 2105 pma = pma_next) { 2106 pma_next = TAILQ_NEXT(pma, pmq); 2107 2108 if (low < pma->pm_constraint.ucr_high && 2109 high > pma->pm_constraint.ucr_low) { 2110 pma->pm_flags |= UVM_PMA_FREED; 2111 if (!(pma->pm_flags & UVM_PMA_BUSY)) { 2112 pma->pm_flags &= ~UVM_PMA_LINKED; 2113 TAILQ_REMOVE(&uvm.pmr_control.allocs, pma, 2114 pmq); 2115 wakeup(pma); 2116 } 2117 } 2118 } 2119 } 2120 2121 void 2122 uvm_pagezero_thread(void *arg) 2123 { 2124 struct pglist pgl; 2125 struct vm_page *pg; 2126 int count; 2127 2128 /* Run at the lowest possible priority. */ 2129 curproc->p_p->ps_nice = NZERO + PRIO_MAX; 2130 2131 KERNEL_UNLOCK(); 2132 2133 TAILQ_INIT(&pgl); 2134 for (;;) { 2135 uvm_lock_fpageq(); 2136 while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET || 2137 (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY, 2138 &pgl, 0, 0, 1)) == 0) { 2139 msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock, 2140 MAXPRI, "pgzero", INFSLP); 2141 } 2142 uvm_unlock_fpageq(); 2143 2144 TAILQ_FOREACH(pg, &pgl, pageq) { 2145 uvm_pagezero(pg); 2146 atomic_setbits_int(&pg->pg_flags, PG_ZERO); 2147 } 2148 2149 uvm_lock_fpageq(); 2150 while (!TAILQ_EMPTY(&pgl)) 2151 uvm_pmr_remove_1strange(&pgl, 0, NULL, 0); 2152 uvmexp.zeropages += count; 2153 uvm_unlock_fpageq(); 2154 2155 yield(); 2156 } 2157 } 2158