/*	$OpenBSD: uvm_pmemrange.c,v 1.76 2024/11/08 15:54:33 mpi Exp $	*/

/*
 * Copyright (c) 2024 Martin Pieuchot <mpi@openbsd.org>
 * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/mount.h>

/*
 * 2 trees: addr tree and size tree.
 *
 * The allocator keeps chunks of free pages (called a range).
 * Two pages are part of the same range if:
 * - all pages in between are part of that range,
 * - they are of the same memory type (zeroed or non-zeroed),
 * - they are part of the same pmemrange.
 * A pmemrange is a range of memory which is part of the same vm_physseg
 * and has a use-count.
 *
 * addr tree is vm_page[0].objt
 * size tree is vm_page[1].objt
 *
 * The size tree is not used for memory ranges of 1 page; instead, such
 * ranges are kept on a single-page queue, linked through vm_page[0].pageq.
 *
 * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
 * are joined, unless:
 * - they have pages in between them which are not free
 * - they belong to different memtypes (zeroed vs dirty memory)
 * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
 * - they are not a continuation of the same array
 * The latter issue is caused by vm_physseg ordering and splitting from the
 * MD initialization machinery. The MD code is dependent on freelists and
 * happens to split ISA memory from non-ISA memory.
 * (Note: freelists die die die!)
 *
 * uvm_page_init guarantees that every vm_physseg contains an array of
 * struct vm_page. Also, uvm_page_physload allocates an array of struct
 * vm_page. This code depends on that array. The array may break across
 * vm_physseg boundaries.
 */

/*
 * Validate the flags of the page. (Used in asserts.)
 * Any free page must have the PQ_FREE flag set.
 * Free pages may be zeroed.
 * Pmap flags are left untouched.
 *
 * The PQ_FREE flag is not checked here: by not checking, we can easily use
 * this check in pages which are freed.
 */
#define VALID_FLAGS(pg_flags) \
	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)

/* Tree comparators.
*/ 75 int uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *, 76 const struct uvm_pmemrange *); 77 int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *); 78 int uvm_pmr_pg_to_memtype(struct vm_page *); 79 80 #ifdef DDB 81 void uvm_pmr_print(void); 82 #endif 83 84 static inline int 85 in_pagedaemon(int allowsyncer) 86 { 87 if (curcpu()->ci_idepth > 0) 88 return 0; 89 if (curproc == uvm.pagedaemon_proc) 90 return 1; 91 /* XXX why is the syncer allowed to use the pagedaemon's reserve? */ 92 if (allowsyncer && (curproc == syncerproc)) 93 return 1; 94 return 0; 95 } 96 97 /* 98 * Memory types. The page flags are used to derive what the current memory 99 * type of a page is. 100 */ 101 int 102 uvm_pmr_pg_to_memtype(struct vm_page *pg) 103 { 104 if (pg->pg_flags & PG_ZERO) 105 return UVM_PMR_MEMTYPE_ZERO; 106 /* Default: dirty memory. */ 107 return UVM_PMR_MEMTYPE_DIRTY; 108 } 109 110 /* Trees. */ 111 RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp); 112 RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp); 113 RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr, 114 uvm_pmemrange_addr_cmp); 115 116 /* Validation. */ 117 #ifdef DEBUG 118 void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr); 119 #else 120 #define uvm_pmr_assertvalid(pmr) do {} while (0) 121 #endif 122 123 psize_t uvm_pmr_get1page(psize_t, int, struct pglist *, 124 paddr_t, paddr_t, int); 125 126 struct uvm_pmemrange *uvm_pmr_allocpmr(void); 127 struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int); 128 struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *, 129 struct vm_page *, int); 130 void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, 131 struct vm_page *pg, struct vm_page **pg_prev, 132 struct vm_page **pg_next); 133 struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *, 134 struct vm_page *, paddr_t); 135 struct vm_page *uvm_pmr_findprevsegment(struct uvm_pmemrange *, 136 struct vm_page *, paddr_t); 137 psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t, 138 struct vm_page **, int); 139 psize_t uvm_pmr_remove_1strange_reverse(struct pglist *, 140 paddr_t *); 141 void uvm_pmr_split(paddr_t); 142 struct uvm_pmemrange *uvm_pmemrange_find(paddr_t); 143 struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *, 144 struct uvm_pmemrange *); 145 psize_t pow2divide(psize_t, psize_t); 146 struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *, 147 struct vm_page *, paddr_t, paddr_t, int); 148 149 /* 150 * Computes num/denom and rounds it up to the next power-of-2. 151 * 152 * This is a division function which calculates an approximation of 153 * num/denom, with result =~ num/denom. It is meant to be fast and doesn't 154 * have to be accurate. 155 * 156 * Providing too large a value makes the allocator slightly faster, at the 157 * risk of hitting the failure case more often. Providing too small a value 158 * makes the allocator a bit slower, but less likely to hit a failure case. 159 */ 160 psize_t 161 pow2divide(psize_t num, psize_t denom) 162 { 163 int rshift; 164 165 for (rshift = 0; num > denom; rshift++, denom <<= 1) 166 ; 167 return (paddr_t)1 << rshift; 168 } 169 170 /* 171 * Predicate: lhs is a subrange or rhs. 172 * 173 * If rhs_low == 0: don't care about lower bound. 174 * If rhs_high == 0: don't care about upper bound. 
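/*
 * Illustrative sketch (not from the original file): a userland-style
 * rendering of pow2divide() above, with a few example values to make the
 * power-of-2 rounding concrete. "pow2divide_example" and
 * "pow2divide_selftest" are hypothetical names; unsigned long stands in
 * for psize_t.
 */
#include <assert.h>

static unsigned long
pow2divide_example(unsigned long num, unsigned long denom)
{
	int rshift;

	/* Double denom until it reaches num; the result is 2^rshift. */
	for (rshift = 0; num > denom; rshift++, denom <<= 1)
		;
	return 1UL << rshift;
}

static void
pow2divide_selftest(void)
{
	assert(pow2divide_example(13, 4) == 4);	/* 13/4 = 3.25, rounded to 4 */
	assert(pow2divide_example(17, 4) == 8);	/* rounds up past 4 to 8 */
	assert(pow2divide_example(3, 7) == 1);	/* num <= denom gives 1 */
}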
175 */ 176 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \ 177 (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \ 178 ((rhs_high) == 0 || (lhs_high) <= (rhs_high))) 179 180 /* 181 * Predicate: lhs intersects with rhs. 182 * 183 * If rhs_low == 0: don't care about lower bound. 184 * If rhs_high == 0: don't care about upper bound. 185 * Ranges don't intersect if they don't have any page in common, array 186 * semantics mean that < instead of <= should be used here. 187 */ 188 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \ 189 (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \ 190 ((rhs_high) == 0 || (lhs_low) < (rhs_high))) 191 192 /* 193 * Align to power-of-2 alignment. 194 */ 195 #define PMR_ALIGN(pgno, align) \ 196 (((pgno) + ((align) - 1)) & ~((align) - 1)) 197 #define PMR_ALIGN_DOWN(pgno, align) \ 198 ((pgno) & ~((align) - 1)) 199 200 201 /* 202 * Comparator: sort by address ascending. 203 */ 204 int 205 uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs, 206 const struct uvm_pmemrange *rhs) 207 { 208 return lhs->low < rhs->low ? -1 : lhs->low > rhs->low; 209 } 210 211 /* 212 * Comparator: sort by use ascending. 213 * 214 * The higher the use value of a range, the more devices need memory in 215 * this range. Therefore allocate from the range with the lowest use first. 216 */ 217 int 218 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs) 219 { 220 int result; 221 222 result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use; 223 if (result == 0) 224 result = uvm_pmemrange_addr_cmp(lhs, rhs); 225 return result; 226 } 227 228 int 229 uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 230 { 231 paddr_t lhs_addr, rhs_addr; 232 233 lhs_addr = VM_PAGE_TO_PHYS(lhs); 234 rhs_addr = VM_PAGE_TO_PHYS(rhs); 235 236 return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr); 237 } 238 239 int 240 uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 241 { 242 psize_t lhs_size, rhs_size; 243 int cmp; 244 245 /* Using second tree, so we receive pg[1] instead of pg[0]. */ 246 lhs_size = (lhs - 1)->fpgsz; 247 rhs_size = (rhs - 1)->fpgsz; 248 249 cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size); 250 if (cmp == 0) 251 cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1); 252 return cmp; 253 } 254 255 /* 256 * Find the first range of free pages that is at least sz pages long. 257 */ 258 struct vm_page * 259 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti) 260 { 261 struct vm_page *node, *best; 262 263 KASSERT(sz >= 1); 264 265 if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti])) 266 return TAILQ_FIRST(&pmr->single[mti]); 267 268 node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]); 269 best = NULL; 270 while (node != NULL) { 271 if ((node - 1)->fpgsz >= sz) { 272 best = (node - 1); 273 node = RBT_LEFT(uvm_objtree, node); 274 } else 275 node = RBT_RIGHT(uvm_objtree, node); 276 } 277 return best; 278 } 279 280 /* 281 * Finds the next range. The next range has a size >= pg->fpgsz. 282 * Returns NULL if no more ranges are available. 283 */ 284 struct vm_page * 285 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt) 286 { 287 struct vm_page *npg; 288 289 KASSERT(pmr != NULL && pg != NULL); 290 if (pg->fpgsz == 1) { 291 if (TAILQ_NEXT(pg, pageq) != NULL) 292 return TAILQ_NEXT(pg, pageq); 293 else 294 npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]); 295 } else 296 npg = RBT_NEXT(uvm_pmr_size, pg + 1); 297 298 return npg == NULL ? 
NULL : npg - 1; 299 } 300 301 /* 302 * Finds the previous and next ranges relative to the (uninserted) pg range. 303 * 304 * *pg_prev == NULL if no previous range is available, that can join with 305 * pg. 306 * *pg_next == NULL if no next range is available, that can join with 307 * pg. 308 */ 309 void 310 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg, 311 struct vm_page **pg_prev, struct vm_page **pg_next) 312 { 313 KASSERT(pg_prev != NULL && pg_next != NULL); 314 315 *pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 316 if (*pg_next == NULL) 317 *pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 318 else 319 *pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next); 320 321 KDASSERT(*pg_next == NULL || 322 VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg)); 323 KDASSERT(*pg_prev == NULL || 324 VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg)); 325 326 /* Reset if not contig. */ 327 if (*pg_prev != NULL && 328 (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz 329 != atop(VM_PAGE_TO_PHYS(pg)) || 330 *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */ 331 uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg))) 332 *pg_prev = NULL; 333 if (*pg_next != NULL && 334 (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz 335 != atop(VM_PAGE_TO_PHYS(*pg_next)) || 336 pg + pg->fpgsz != *pg_next || /* Array broke. */ 337 uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg))) 338 *pg_next = NULL; 339 return; 340 } 341 342 /* 343 * Remove a range from the address tree. 344 * Address tree maintains pmr counters. 345 */ 346 void 347 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg) 348 { 349 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 350 KDASSERT(pg->pg_flags & PQ_FREE); 351 RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg); 352 353 pmr->nsegs--; 354 } 355 /* 356 * Remove a range from the size tree. 357 */ 358 void 359 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 360 { 361 int memtype; 362 #ifdef DEBUG 363 struct vm_page *i; 364 #endif 365 366 KDASSERT(pg->fpgsz >= 1); 367 KDASSERT(pg->pg_flags & PQ_FREE); 368 memtype = uvm_pmr_pg_to_memtype(pg); 369 370 if (pg->fpgsz == 1) { 371 #ifdef DEBUG 372 TAILQ_FOREACH(i, &pmr->single[memtype], pageq) { 373 if (i == pg) 374 break; 375 } 376 KDASSERT(i == pg); 377 #endif 378 TAILQ_REMOVE(&pmr->single[memtype], pg, pageq); 379 } else { 380 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype], 381 pg + 1) == pg + 1); 382 RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1); 383 } 384 } 385 /* Remove from both trees. */ 386 void 387 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg) 388 { 389 uvm_pmr_assertvalid(pmr); 390 uvm_pmr_remove_size(pmr, pg); 391 uvm_pmr_remove_addr(pmr, pg); 392 uvm_pmr_assertvalid(pmr); 393 } 394 395 /* 396 * Insert the range described in pg. 397 * Returns the range thus created (which may be joined with the previous and 398 * next ranges). 399 * If no_join, the caller guarantees that the range cannot possibly join 400 * with adjacent ranges. 
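/*
 * Illustrative sketch (not from the original file): the size tree links
 * through the *second* vm_page of a free range (pg + 1), which is why
 * uvm_pmr_size_cmp() and uvm_pmr_remove_size() above step back by one page
 * to reach the range header. Single-page ranges have no second page, hence
 * the per-memtype single[] queues. A toy version of that pointer
 * arithmetic, with hypothetical names and a stand-in struct:
 */
struct toy_page {
	unsigned long	fpgsz;	/* valid only in the first page of a range */
	/* the real vm_page embeds the RBT/TAILQ linkage here */
};

/* Node handed to the size tree: the second page of the range. */
static struct toy_page *
toy_size_node(struct toy_page *range_head)
{
	return range_head + 1;
}

/* Comparators and removals recover the header by stepping back one page. */
static struct toy_page *
toy_range_head(struct toy_page *size_node)
{
	return size_node - 1;
}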
401 */ 402 struct vm_page * 403 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 404 { 405 struct vm_page *prev, *next; 406 407 #ifdef DEBUG 408 struct vm_page *i; 409 int mt; 410 #endif 411 412 KDASSERT(pg->pg_flags & PQ_FREE); 413 KDASSERT(pg->fpgsz >= 1); 414 415 #ifdef DEBUG 416 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 417 TAILQ_FOREACH(i, &pmr->single[mt], pageq) 418 KDASSERT(i != pg); 419 if (pg->fpgsz > 1) { 420 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt], 421 pg + 1) == NULL); 422 } 423 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL); 424 } 425 #endif 426 427 if (!no_join) { 428 uvm_pmr_pnaddr(pmr, pg, &prev, &next); 429 if (next != NULL) { 430 uvm_pmr_remove_size(pmr, next); 431 uvm_pmr_remove_addr(pmr, next); 432 pg->fpgsz += next->fpgsz; 433 next->fpgsz = 0; 434 } 435 if (prev != NULL) { 436 uvm_pmr_remove_size(pmr, prev); 437 prev->fpgsz += pg->fpgsz; 438 pg->fpgsz = 0; 439 return prev; 440 } 441 } 442 443 RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg); 444 445 pmr->nsegs++; 446 447 return pg; 448 } 449 /* 450 * Insert the range described in pg. 451 * Returns the range thus created (which may be joined with the previous and 452 * next ranges). 453 * Page must already be in the address tree. 454 */ 455 void 456 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 457 { 458 int memtype; 459 #ifdef DEBUG 460 struct vm_page *i; 461 int mti; 462 #endif 463 464 KDASSERT(pg->fpgsz >= 1); 465 KDASSERT(pg->pg_flags & PQ_FREE); 466 467 memtype = uvm_pmr_pg_to_memtype(pg); 468 #ifdef DEBUG 469 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 470 TAILQ_FOREACH(i, &pmr->single[mti], pageq) 471 KDASSERT(i != pg); 472 if (pg->fpgsz > 1) { 473 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti], 474 pg + 1) == NULL); 475 } 476 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 477 } 478 for (i = pg; i < pg + pg->fpgsz; i++) 479 KASSERT(uvm_pmr_pg_to_memtype(i) == memtype); 480 #endif 481 482 if (pg->fpgsz == 1) 483 TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq); 484 else 485 RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1); 486 } 487 /* Insert in both trees. */ 488 struct vm_page * 489 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 490 { 491 uvm_pmr_assertvalid(pmr); 492 pg = uvm_pmr_insert_addr(pmr, pg, no_join); 493 uvm_pmr_insert_size(pmr, pg); 494 uvm_pmr_assertvalid(pmr); 495 return pg; 496 } 497 498 /* 499 * Find the last page that is part of this segment. 500 * => pg: the range at which to start the search. 501 * => boundary: the page number boundary specification (0 = no boundary). 502 * => pmr: the pmemrange of the page. 503 * 504 * This function returns 1 before the next range, so if you want to have the 505 * next range, you need to run TAILQ_NEXT(result, pageq) after calling. 506 * The reason is that this way, the length of the segment is easily 507 * calculated using: atop(result) - atop(pg) + 1. 508 * Hence this function also never returns NULL. 509 */ 510 struct vm_page * 511 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr, 512 struct vm_page *pg, paddr_t boundary) 513 { 514 paddr_t first_boundary; 515 struct vm_page *next; 516 struct vm_page *prev; 517 518 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && 519 pmr->high > atop(VM_PAGE_TO_PHYS(pg))); 520 if (boundary != 0) { 521 first_boundary = 522 PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary); 523 } else 524 first_boundary = 0; 525 526 /* 527 * Increase next until it hits the first page of the next segment. 
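/*
 * Illustrative sketch (not from the original file): the power-of-2
 * alignment macros and the first_boundary computation above, restated as
 * standalone assertions on sample page numbers. EX_ALIGN, EX_ALIGN_DOWN and
 * align_selftest are hypothetical copies of PMR_ALIGN/PMR_ALIGN_DOWN.
 */
#include <assert.h>

#define EX_ALIGN(pgno, align)		(((pgno) + ((align) - 1)) & ~((align) - 1))
#define EX_ALIGN_DOWN(pgno, align)	((pgno) & ~((align) - 1))

static void
align_selftest(void)
{
	assert(EX_ALIGN(0x1001UL, 0x10UL) == 0x1010UL);		/* round up */
	assert(EX_ALIGN(0x1010UL, 0x10UL) == 0x1010UL);		/* already aligned */
	assert(EX_ALIGN_DOWN(0x101fUL, 0x10UL) == 0x1010UL);	/* round down */
	/*
	 * A segment starting at page 0x1234 with boundary 0x100 may not
	 * reach first_boundary == EX_ALIGN(0x1234 + 1, 0x100) == 0x1300.
	 */
	assert(EX_ALIGN(0x1234UL + 1, 0x100UL) == 0x1300UL);
}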
528 * 529 * While loop checks the following: 530 * - next != NULL we have not reached the end of pgl 531 * - boundary == 0 || next < first_boundary 532 * we do not cross a boundary 533 * - atop(prev) + 1 == atop(next) 534 * still in the same segment 535 * - low <= last 536 * - high > last still in the same memory range 537 * - memtype is equal allocator is unable to view different memtypes 538 * as part of the same segment 539 * - prev + 1 == next no array breakage occurs 540 */ 541 prev = pg; 542 next = TAILQ_NEXT(prev, pageq); 543 while (next != NULL && 544 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) && 545 atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) && 546 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && 547 pmr->high > atop(VM_PAGE_TO_PHYS(next)) && 548 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && 549 prev + 1 == next) { 550 prev = next; 551 next = TAILQ_NEXT(prev, pageq); 552 } 553 554 /* 555 * End of this segment. 556 */ 557 return prev; 558 } 559 560 /* 561 * Find the first page that is part of this segment. 562 * => pg: the range at which to start the search. 563 * => boundary: the page number boundary specification (0 = no boundary). 564 * => pmr: the pmemrange of the page. 565 * 566 * This function returns 1 after the previous range, so if you want to have the 567 * previous range, you need to run TAILQ_NEXT(result, pageq) after calling. 568 * The reason is that this way, the length of the segment is easily 569 * calculated using: atop(pg) - atop(result) + 1. 570 * Hence this function also never returns NULL. 571 */ 572 struct vm_page * 573 uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr, 574 struct vm_page *pg, paddr_t boundary) 575 { 576 paddr_t first_boundary; 577 struct vm_page *next; 578 struct vm_page *prev; 579 580 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && 581 pmr->high > atop(VM_PAGE_TO_PHYS(pg))); 582 if (boundary != 0) { 583 first_boundary = 584 PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary); 585 } else 586 first_boundary = 0; 587 588 /* 589 * Increase next until it hits the first page of the previous segment. 590 * 591 * While loop checks the following: 592 * - next != NULL we have not reached the end of pgl 593 * - boundary == 0 || next >= first_boundary 594 * we do not cross a boundary 595 * - atop(prev) - 1 == atop(next) 596 * still in the same segment 597 * - low <= last 598 * - high > last still in the same memory range 599 * - memtype is equal allocator is unable to view different memtypes 600 * as part of the same segment 601 * - prev - 1 == next no array breakage occurs 602 */ 603 prev = pg; 604 next = TAILQ_NEXT(prev, pageq); 605 while (next != NULL && 606 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) && 607 atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) && 608 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && 609 pmr->high > atop(VM_PAGE_TO_PHYS(next)) && 610 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && 611 prev - 1 == next) { 612 prev = next; 613 next = TAILQ_NEXT(prev, pageq); 614 } 615 616 /* 617 * Start of this segment. 618 */ 619 return prev; 620 } 621 622 /* 623 * Remove the first segment of contiguous pages from pgl. 624 * A segment ends if it crosses boundary (unless boundary = 0) or 625 * if it would enter a different uvm_pmemrange. 626 * 627 * Work: the page range that the caller is currently working with. 628 * May be null. 629 * 630 * If is_desperate is non-zero, the smallest segment is erased. 
Otherwise, 631 * the first segment is erased (which, if called by uvm_pmr_getpages(), 632 * probably is the smallest or very close to it). 633 */ 634 psize_t 635 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary, 636 struct vm_page **work, int is_desperate) 637 { 638 struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest; 639 psize_t count; 640 struct uvm_pmemrange *pmr, *pmr_iter; 641 642 KASSERT(!TAILQ_EMPTY(pgl)); 643 644 /* 645 * Initialize to first page. 646 * Unless desperate scan finds a better candidate, this is what'll be 647 * erased. 648 */ 649 start = TAILQ_FIRST(pgl); 650 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); 651 end = uvm_pmr_findnextsegment(pmr, start, boundary); 652 653 /* 654 * If we are desperate, we _really_ want to get rid of the smallest 655 * element (rather than a close match to the smallest element). 656 */ 657 if (is_desperate) { 658 /* Linear search for smallest segment. */ 659 pmr_iter = pmr; 660 for (iter = TAILQ_NEXT(end, pageq); 661 iter != NULL && start != end; 662 iter = TAILQ_NEXT(iter_end, pageq)) { 663 /* 664 * Only update pmr if it doesn't match current 665 * iteration. 666 */ 667 if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) || 668 pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) { 669 pmr_iter = uvm_pmemrange_find(atop( 670 VM_PAGE_TO_PHYS(iter))); 671 } 672 673 iter_end = uvm_pmr_findnextsegment(pmr_iter, iter, 674 boundary); 675 676 /* 677 * Current iteration is smaller than best match so 678 * far; update. 679 */ 680 if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) < 681 VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) { 682 start = iter; 683 end = iter_end; 684 pmr = pmr_iter; 685 } 686 } 687 } 688 689 /* 690 * Calculate count and end of the list. 691 */ 692 count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1; 693 lowest = start; 694 end = TAILQ_NEXT(end, pageq); 695 696 /* 697 * Actually remove the range of pages. 698 * 699 * Sadly, this cannot be done using pointer iteration: 700 * vm_physseg is not guaranteed to be sorted on address, hence 701 * uvm_page_init() may not have initialized its array sorted by 702 * page number. 703 */ 704 for (iter = start; iter != end; iter = iter_end) { 705 iter_end = TAILQ_NEXT(iter, pageq); 706 TAILQ_REMOVE(pgl, iter, pageq); 707 } 708 709 lowest->fpgsz = count; 710 inserted = uvm_pmr_insert(pmr, lowest, 0); 711 712 /* 713 * If the caller was working on a range and this function modified 714 * that range, update the pointer. 715 */ 716 if (work != NULL && *work != NULL && 717 atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) && 718 atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz > 719 atop(VM_PAGE_TO_PHYS(*work))) 720 *work = inserted; 721 return count; 722 } 723 724 /* 725 * Remove the first segment of contiguous pages from a pgl 726 * with the list elements in reverse order of physaddr. 727 * 728 * A segment ends if it would enter a different uvm_pmemrange. 729 * 730 * Stores starting physical address of the segment in pstart. 731 */ 732 psize_t 733 uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart) 734 { 735 struct vm_page *start, *end, *iter, *iter_end, *lowest; 736 psize_t count; 737 struct uvm_pmemrange *pmr; 738 739 KASSERT(!TAILQ_EMPTY(pgl)); 740 741 start = TAILQ_FIRST(pgl); 742 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); 743 end = uvm_pmr_findprevsegment(pmr, start, 0); 744 745 KASSERT(end <= start); 746 747 /* 748 * Calculate count and end of the list. 
749 */ 750 count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1; 751 lowest = end; 752 end = TAILQ_NEXT(end, pageq); 753 754 /* 755 * Actually remove the range of pages. 756 * 757 * Sadly, this cannot be done using pointer iteration: 758 * vm_physseg is not guaranteed to be sorted on address, hence 759 * uvm_page_init() may not have initialized its array sorted by 760 * page number. 761 */ 762 for (iter = start; iter != end; iter = iter_end) { 763 iter_end = TAILQ_NEXT(iter, pageq); 764 TAILQ_REMOVE(pgl, iter, pageq); 765 } 766 767 lowest->fpgsz = count; 768 (void) uvm_pmr_insert(pmr, lowest, 0); 769 770 *pstart = VM_PAGE_TO_PHYS(lowest); 771 return count; 772 } 773 774 /* 775 * Extract a number of pages from a segment of free pages. 776 * Called by uvm_pmr_getpages. 777 * 778 * Returns the segment that was created from pages left over at the tail 779 * of the remove set of pages, or NULL if no pages were left at the tail. 780 */ 781 struct vm_page * 782 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg, 783 paddr_t start, paddr_t end, struct pglist *result) 784 { 785 struct vm_page *after, *pg_i; 786 psize_t before_sz, after_sz; 787 #ifdef DEBUG 788 psize_t i; 789 #endif 790 791 KDASSERT(end > start); 792 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg))); 793 KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz); 794 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start); 795 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end); 796 797 before_sz = start - atop(VM_PAGE_TO_PHYS(pg)); 798 after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end; 799 KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz); 800 uvm_pmr_assertvalid(pmr); 801 802 uvm_pmr_remove_size(pmr, pg); 803 if (before_sz == 0) 804 uvm_pmr_remove_addr(pmr, pg); 805 after = pg + before_sz + (end - start); 806 807 /* Add selected pages to result. */ 808 for (pg_i = pg + before_sz; pg_i != after; pg_i++) { 809 KASSERT(pg_i->pg_flags & PQ_FREE); 810 pg_i->fpgsz = 0; 811 TAILQ_INSERT_TAIL(result, pg_i, pageq); 812 } 813 814 /* Before handling. */ 815 if (before_sz > 0) { 816 pg->fpgsz = before_sz; 817 uvm_pmr_insert_size(pmr, pg); 818 } 819 820 /* After handling. */ 821 if (after_sz > 0) { 822 #ifdef DEBUG 823 for (i = 0; i < after_sz; i++) { 824 KASSERT(!uvm_pmr_isfree(after + i)); 825 } 826 #endif 827 KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end); 828 after->fpgsz = after_sz; 829 after = uvm_pmr_insert_addr(pmr, after, 1); 830 uvm_pmr_insert_size(pmr, after); 831 } 832 833 uvm_pmr_assertvalid(pmr); 834 return (after_sz > 0 ? after : NULL); 835 } 836 837 /* 838 * Acquire a number of pages. 839 * 840 * count: the number of pages returned 841 * start: lowest page number 842 * end: highest page number +1 843 * (start = end = 0: no limitation) 844 * align: power-of-2 alignment constraint (align = 1: no alignment) 845 * boundary: power-of-2 boundary (boundary = 0: no boundary) 846 * maxseg: maximum number of segments to return 847 * flags: UVM_PLA_* flags 848 * result: returned pages storage (uses pageq) 849 */ 850 int 851 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align, 852 paddr_t boundary, int maxseg, int flags, struct pglist *result) 853 { 854 struct uvm_pmemrange *pmr; /* Iterate memory ranges. */ 855 struct vm_page *found, *f_next; /* Iterate chunks. */ 856 psize_t fcount; /* Current found pages. */ 857 int fnsegs; /* Current segment counter. */ 858 int try, start_try; 859 psize_t search[3]; 860 paddr_t fstart, fend; /* Pages to be taken from found. 
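/*
 * Illustrative sketch (not from the original file): how a caller could use
 * the parameter list documented above to request four physically contiguous
 * pages below 16MB. In-tree code normally reaches this allocator through the
 * uvm_pglistalloc(9) wrapper; "example_alloc_low_pages" is a hypothetical
 * name and error handling is omitted.
 */
static int
example_alloc_low_pages(struct pglist *pgl)
{
	TAILQ_INIT(pgl);
	/* count=4, page range [0, atop(16MB)), align=1, no boundary, maxseg=1 */
	return uvm_pmr_getpages(4, 0, atop(16 * 1024 * 1024), 1, 0, 1,
	    UVM_PLA_WAITOK, pgl);
}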
*/ 861 int memtype; /* Requested memtype. */ 862 int memtype_init; /* Best memtype. */ 863 int desperate; /* True if allocation failed. */ 864 #ifdef DIAGNOSTIC 865 struct vm_page *diag_prev; /* Used during validation. */ 866 #endif /* DIAGNOSTIC */ 867 868 /* 869 * Validate arguments. 870 */ 871 KASSERT(count > 0); 872 KASSERT(start == 0 || end == 0 || start < end); 873 KASSERT(align >= 1); 874 KASSERT(powerof2(align)); 875 KASSERT(maxseg > 0); 876 KASSERT(boundary == 0 || powerof2(boundary)); 877 KASSERT(boundary == 0 || maxseg * boundary >= count); 878 KASSERT(TAILQ_EMPTY(result)); 879 KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT)); 880 881 /* 882 * TRYCONTIG is a noop if you only want a single segment. 883 * Remove it if that's the case: otherwise it'll deny the fast 884 * allocation. 885 */ 886 if (maxseg == 1 || count == 1) 887 flags &= ~UVM_PLA_TRYCONTIG; 888 889 /* 890 * Configure search. 891 * 892 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case. 893 * search[1] is multiple segments, chosen to fulfill the search in 894 * approximately even-sized segments. 895 * This is a good trade-off between slightly reduced allocation speed 896 * and less fragmentation. 897 * search[2] is the worst case, in which all segments are evaluated. 898 * This provides the least fragmentation, but makes the search 899 * possibly longer (although in the case it is selected, that no 900 * longer matters most). 901 * 902 * The exception is when maxseg == 1: since we can only fulfill that 903 * with one segment of size pages, only a single search type has to 904 * be attempted. 905 */ 906 if (maxseg == 1 || count == 1) { 907 start_try = 2; 908 search[2] = count; 909 } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) { 910 start_try = 2; 911 search[2] = 1; 912 } else { 913 start_try = 0; 914 search[0] = count; 915 search[1] = pow2divide(count, maxseg); 916 search[2] = 1; 917 if ((flags & UVM_PLA_TRYCONTIG) == 0) 918 start_try = 1; 919 if (search[1] >= search[0]) { 920 search[1] = search[0]; 921 start_try = 1; 922 } 923 if (search[2] >= search[start_try]) { 924 start_try = 2; 925 } 926 } 927 928 /* 929 * Memory type: if zeroed memory is requested, traverse the zero set. 930 * Otherwise, traverse the dirty set. 931 * 932 * The memtype iterator is reinitialized to memtype_init on entrance 933 * of a pmemrange. 934 */ 935 if (flags & UVM_PLA_ZERO) 936 memtype_init = UVM_PMR_MEMTYPE_ZERO; 937 else 938 memtype_init = UVM_PMR_MEMTYPE_DIRTY; 939 940 /* 941 * Initially, we're not desperate. 942 * 943 * Note that if we return from a sleep, we are still desperate. 944 * Chances are that memory pressure is still high, so resetting 945 * seems over-optimistic to me. 946 */ 947 desperate = 0; 948 949 uvm_lock_fpageq(); 950 retry: /* Return point after sleeping. */ 951 /* 952 * check to see if we need to generate some free pages waking 953 * the pagedaemon. 954 */ 955 if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin || 956 ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg && 957 (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) 958 wakeup(&uvm.pagedaemon); 959 960 /* 961 * fail if any of these conditions is true: 962 * [1] there really are no free pages, or 963 * [2] only kernel "reserved" pages remain and 964 * the UVM_PLA_USERESERVE flag wasn't used. 965 * [3] only pagedaemon "reserved" pages remain and 966 * the requestor isn't the pagedaemon nor the syncer. 
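/*
 * Worked example (added for illustration, not from the original file) of
 * the search[] setup above: count = 13, maxseg = 4, UVM_PLA_TRYCONTIG not
 * set.
 *
 *   search[0] = 13                     one contiguous chunk (skipped here)
 *   search[1] = pow2divide(13, 4) = 4  aim for ~even segments of >= 4 pages
 *   search[2] = 1                      worst case, any free range will do
 *   start_try = 1
 *
 * With UVM_PLA_TRYCONTIG set, start_try would stay 0 and a single 13-page
 * range would be attempted first.
 */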
967 */ 968 if ((uvmexp.free <= (uvmexp.reserve_kernel + count)) && 969 !(flags & UVM_PLA_USERESERVE)) { 970 uvm_unlock_fpageq(); 971 return ENOMEM; 972 } 973 974 if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) && 975 !in_pagedaemon(1)) { 976 uvm_unlock_fpageq(); 977 if (flags & UVM_PLA_WAITOK) { 978 uvm_wait("uvm_pmr_getpages"); 979 uvm_lock_fpageq(); 980 goto retry; 981 } 982 return ENOMEM; 983 } 984 985 fcount = 0; 986 fnsegs = 0; 987 988 retry_desperate: 989 /* 990 * If we just want any page(s), go for the really fast option. 991 */ 992 if (count <= maxseg && align == 1 && boundary == 0 && 993 (flags & UVM_PLA_TRYCONTIG) == 0) { 994 fcount += uvm_pmr_get1page(count - fcount, memtype_init, 995 result, start, end, 0); 996 997 /* 998 * If we found sufficient pages, go to the success exit code. 999 * 1000 * Otherwise, go immediately to fail, since we collected 1001 * all we could anyway. 1002 */ 1003 if (fcount == count) 1004 goto out; 1005 else 1006 goto fail; 1007 } 1008 1009 /* 1010 * The heart of the contig case. 1011 * 1012 * The code actually looks like this: 1013 * 1014 * foreach (struct pmemrange) { 1015 * foreach (memtype) { 1016 * foreach(try) { 1017 * foreach (free range of memtype in pmemrange, 1018 * starting at search[try]) { 1019 * while (range has space left) 1020 * take from range 1021 * } 1022 * } 1023 * } 1024 * 1025 * if next pmemrange has higher usecount than current: 1026 * enter desperate case (which will drain the pmemranges 1027 * until empty prior to moving to the next one) 1028 * } 1029 * 1030 * When desperate is activated, try always starts at the highest 1031 * value. The memtype loop is using a goto ReScanMemtype. 1032 * The try loop is using a goto ReScan. 1033 * The 'range has space left' loop uses label DrainFound. 1034 * 1035 * Writing them all as loops would take up a lot of screen space in 1036 * the form of indentation and some parts are easier to express 1037 * using the labels. 1038 */ 1039 1040 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1041 /* Empty range. */ 1042 if (pmr->nsegs == 0) 1043 continue; 1044 1045 /* Outside requested range. */ 1046 if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 1047 continue; 1048 1049 memtype = memtype_init; 1050 1051 rescan_memtype: /* Return point at memtype++. */ 1052 try = start_try; 1053 1054 rescan: /* Return point at try++. */ 1055 for (found = uvm_pmr_nfindsz(pmr, search[try], memtype); 1056 found != NULL; 1057 found = f_next) { 1058 f_next = uvm_pmr_nextsz(pmr, found, memtype); 1059 1060 fstart = atop(VM_PAGE_TO_PHYS(found)); 1061 if (start != 0) 1062 fstart = MAX(start, fstart); 1063 drain_found: 1064 /* 1065 * Throw away the first segment if fnsegs == maxseg 1066 * 1067 * Note that f_next is still valid after this call, 1068 * since we only allocated from entries before f_next. 1069 * We don't revisit the entries we already extracted 1070 * from unless we entered the desperate case. 
1071 */ 1072 if (fnsegs == maxseg) { 1073 fnsegs--; 1074 fcount -= 1075 uvm_pmr_remove_1strange(result, boundary, 1076 &found, desperate); 1077 } 1078 1079 fstart = PMR_ALIGN(fstart, align); 1080 fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz; 1081 if (end != 0) 1082 fend = MIN(end, fend); 1083 if (boundary != 0) { 1084 fend = 1085 MIN(fend, PMR_ALIGN(fstart + 1, boundary)); 1086 } 1087 if (fstart >= fend) 1088 continue; 1089 if (fend - fstart > count - fcount) 1090 fend = fstart + (count - fcount); 1091 1092 fcount += fend - fstart; 1093 fnsegs++; 1094 found = uvm_pmr_extract_range(pmr, found, 1095 fstart, fend, result); 1096 1097 if (fcount == count) 1098 goto out; 1099 1100 /* 1101 * If there's still space left in found, try to 1102 * fully drain it prior to continuing. 1103 */ 1104 if (found != NULL) { 1105 fstart = fend; 1106 goto drain_found; 1107 } 1108 } 1109 1110 /* Try a smaller search now. */ 1111 if (++try < nitems(search)) 1112 goto rescan; 1113 1114 /* 1115 * Exhaust all memory types prior to going to the next memory 1116 * segment. 1117 * This means that zero-vs-dirty are eaten prior to moving 1118 * to a pmemrange with a higher use-count. 1119 * 1120 * Code is basically a difficult way of writing: 1121 * memtype = memtype_init; 1122 * do { 1123 * ...; 1124 * memtype += 1; 1125 * memtype %= MEMTYPE_MAX; 1126 * } while (memtype != memtype_init); 1127 */ 1128 memtype += 1; 1129 if (memtype == UVM_PMR_MEMTYPE_MAX) 1130 memtype = 0; 1131 if (memtype != memtype_init) 1132 goto rescan_memtype; 1133 1134 /* 1135 * If not desperate, enter desperate case prior to eating all 1136 * the good stuff in the next range. 1137 */ 1138 if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL && 1139 TAILQ_NEXT(pmr, pmr_use)->use != pmr->use) 1140 break; 1141 } 1142 1143 /* 1144 * Not enough memory of the requested type available. Fall back to 1145 * less good memory that we'll clean up better later. 1146 * 1147 * This algorithm is not very smart though, it just starts scanning 1148 * a different typed range, but the nicer ranges of the previous 1149 * iteration may fall out. Hence there is a small chance of a false 1150 * negative. 1151 * 1152 * When desperate: scan all sizes starting at the smallest 1153 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may 1154 * allow us to hit the fast path now). 1155 * 1156 * Also, because we will revisit entries we scanned before, we need 1157 * to reset the page queue, or we may end up releasing entries in 1158 * such a way as to invalidate f_next. 1159 */ 1160 if (!desperate) { 1161 desperate = 1; 1162 start_try = nitems(search) - 1; 1163 flags &= ~UVM_PLA_TRYCONTIG; 1164 1165 while (!TAILQ_EMPTY(result)) 1166 uvm_pmr_remove_1strange(result, 0, NULL, 0); 1167 fnsegs = 0; 1168 fcount = 0; 1169 goto retry_desperate; 1170 } 1171 1172 fail: 1173 /* Allocation failed. 
*/ 1174 /* XXX: claim from memory reserve here */ 1175 1176 while (!TAILQ_EMPTY(result)) 1177 uvm_pmr_remove_1strange(result, 0, NULL, 0); 1178 1179 if (flags & UVM_PLA_WAITOK) { 1180 if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count), 1181 flags & UVM_PLA_FAILOK) == 0) 1182 goto retry; 1183 KASSERT(flags & UVM_PLA_FAILOK); 1184 } else if (!(flags & UVM_PLA_NOWAKE)) { 1185 struct uvm_pmalloc *pma = &nowait_pma; 1186 1187 if (!(nowait_pma.pm_flags & UVM_PMA_LINKED)) { 1188 nowait_pma.pm_flags = UVM_PMA_LINKED; 1189 TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, pma, pmq); 1190 wakeup(&uvm.pagedaemon); 1191 } 1192 } 1193 uvm_unlock_fpageq(); 1194 1195 return ENOMEM; 1196 1197 out: 1198 /* Allocation successful. */ 1199 uvmexp.free -= fcount; 1200 1201 uvm_unlock_fpageq(); 1202 1203 /* Update statistics and zero pages if UVM_PLA_ZERO. */ 1204 #ifdef DIAGNOSTIC 1205 fnsegs = 0; 1206 fcount = 0; 1207 diag_prev = NULL; 1208 #endif /* DIAGNOSTIC */ 1209 TAILQ_FOREACH(found, result, pageq) { 1210 atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK); 1211 1212 if (found->pg_flags & PG_ZERO) { 1213 uvm_lock_fpageq(); 1214 uvmexp.zeropages--; 1215 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1216 wakeup(&uvmexp.zeropages); 1217 uvm_unlock_fpageq(); 1218 } 1219 if (flags & UVM_PLA_ZERO) { 1220 if (found->pg_flags & PG_ZERO) 1221 uvmexp.pga_zerohit++; 1222 else { 1223 uvmexp.pga_zeromiss++; 1224 uvm_pagezero(found); 1225 } 1226 } 1227 atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE); 1228 1229 found->uobject = NULL; 1230 found->uanon = NULL; 1231 found->pg_version++; 1232 1233 /* 1234 * Validate that the page matches range criterium. 1235 */ 1236 KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start); 1237 KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end); 1238 1239 #ifdef DIAGNOSTIC 1240 /* 1241 * Update fcount (# found pages) and 1242 * fnsegs (# found segments) counters. 1243 */ 1244 if (diag_prev == NULL || 1245 /* new segment if it contains a hole */ 1246 atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 != 1247 atop(VM_PAGE_TO_PHYS(found)) || 1248 /* new segment if it crosses boundary */ 1249 (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) != 1250 (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1))) 1251 fnsegs++; 1252 fcount++; 1253 1254 diag_prev = found; 1255 #endif /* DIAGNOSTIC */ 1256 } 1257 1258 #ifdef DIAGNOSTIC 1259 /* 1260 * Panic on algorithm failure. 1261 */ 1262 if (fcount != count || fnsegs > maxseg) { 1263 panic("pmemrange allocation error: " 1264 "allocated %ld pages in %d segments, " 1265 "but request was %ld pages in %d segments", 1266 fcount, fnsegs, count, maxseg); 1267 } 1268 #endif /* DIAGNOSTIC */ 1269 1270 return 0; 1271 } 1272 1273 /* 1274 * Acquire a single page. 1275 * 1276 * flags: UVM_PLA_* flags 1277 * result: returned page. 1278 */ 1279 struct vm_page * 1280 uvm_pmr_getone(int flags) 1281 { 1282 struct vm_page *pg; 1283 struct pglist pgl; 1284 1285 TAILQ_INIT(&pgl); 1286 if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, flags, &pgl) != 0) 1287 return NULL; 1288 1289 pg = TAILQ_FIRST(&pgl); 1290 KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL); 1291 1292 return pg; 1293 } 1294 1295 /* 1296 * Free a number of contig pages (invoked by uvm_page_init). 
1297 */ 1298 void 1299 uvm_pmr_freepages(struct vm_page *pg, psize_t count) 1300 { 1301 struct uvm_pmemrange *pmr; 1302 psize_t i, pmr_count; 1303 struct vm_page *firstpg = pg; 1304 1305 for (i = 0; i < count; i++) { 1306 KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) == 1307 atop(VM_PAGE_TO_PHYS(pg)) + i); 1308 1309 if (!((pg[i].pg_flags & PQ_FREE) == 0 && 1310 VALID_FLAGS(pg[i].pg_flags))) { 1311 printf("Flags: 0x%x, will panic now.\n", 1312 pg[i].pg_flags); 1313 } 1314 KASSERT((pg[i].pg_flags & PQ_FREE) == 0 && 1315 VALID_FLAGS(pg[i].pg_flags)); 1316 atomic_setbits_int(&pg[i].pg_flags, PQ_FREE); 1317 atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO); 1318 } 1319 1320 uvm_lock_fpageq(); 1321 1322 for (i = count; i > 0; i -= pmr_count) { 1323 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1324 KASSERT(pmr != NULL); 1325 1326 pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg))); 1327 pg->fpgsz = pmr_count; 1328 uvm_pmr_insert(pmr, pg, 0); 1329 1330 uvmexp.free += pmr_count; 1331 pg += pmr_count; 1332 } 1333 wakeup(&uvmexp.free); 1334 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1335 wakeup(&uvmexp.zeropages); 1336 1337 uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count)); 1338 1339 uvm_unlock_fpageq(); 1340 } 1341 1342 /* 1343 * Free all pages in the queue. 1344 */ 1345 void 1346 uvm_pmr_freepageq(struct pglist *pgl) 1347 { 1348 struct vm_page *pg; 1349 paddr_t pstart; 1350 psize_t plen; 1351 1352 TAILQ_FOREACH(pg, pgl, pageq) { 1353 if (!((pg->pg_flags & PQ_FREE) == 0 && 1354 VALID_FLAGS(pg->pg_flags))) { 1355 printf("Flags: 0x%x, will panic now.\n", 1356 pg->pg_flags); 1357 } 1358 KASSERT((pg->pg_flags & PQ_FREE) == 0 && 1359 VALID_FLAGS(pg->pg_flags)); 1360 atomic_setbits_int(&pg->pg_flags, PQ_FREE); 1361 atomic_clearbits_int(&pg->pg_flags, PG_ZERO); 1362 } 1363 1364 uvm_lock_fpageq(); 1365 while (!TAILQ_EMPTY(pgl)) { 1366 pg = TAILQ_FIRST(pgl); 1367 if (pg == TAILQ_NEXT(pg, pageq) + 1) { 1368 /* 1369 * If pg is one behind the position of the 1370 * next page in the list in the page array, 1371 * try going backwards instead of forward. 1372 */ 1373 plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart); 1374 } else { 1375 pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl)); 1376 plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0); 1377 } 1378 uvmexp.free += plen; 1379 1380 uvm_wakeup_pla(pstart, ptoa(plen)); 1381 } 1382 wakeup(&uvmexp.free); 1383 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1384 wakeup(&uvmexp.zeropages); 1385 uvm_unlock_fpageq(); 1386 1387 return; 1388 } 1389 1390 /* 1391 * Store a pmemrange in the list. 1392 * 1393 * The list is sorted by use. 1394 */ 1395 struct uvm_pmemrange * 1396 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq, 1397 struct uvm_pmemrange *pmr) 1398 { 1399 struct uvm_pmemrange *iter; 1400 int cmp = 1; 1401 1402 TAILQ_FOREACH(iter, useq, pmr_use) { 1403 cmp = uvm_pmemrange_use_cmp(pmr, iter); 1404 if (cmp == 0) 1405 return iter; 1406 if (cmp == -1) 1407 break; 1408 } 1409 1410 if (iter == NULL) 1411 TAILQ_INSERT_TAIL(useq, pmr, pmr_use); 1412 else 1413 TAILQ_INSERT_BEFORE(iter, pmr, pmr_use); 1414 return NULL; 1415 } 1416 1417 #ifdef DEBUG 1418 /* 1419 * Validation of the whole pmemrange. 1420 * Called with fpageq locked. 1421 */ 1422 void 1423 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr) 1424 { 1425 struct vm_page *prev, *next, *i, *xref; 1426 int lcv, mti; 1427 1428 /* Empty range */ 1429 if (pmr->nsegs == 0) 1430 return; 1431 1432 /* Validate address tree. */ 1433 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) { 1434 /* Validate the range. 
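/*
 * Illustrative sketch (not from the original file): the test
 * "pg == TAILQ_NEXT(pg, pageq) + 1" in uvm_pmr_freepageq() above detects a
 * list whose head runs downward through the vm_page array, i.e. the second
 * element sits exactly one slot before the first. A toy version with a
 * hypothetical struct and name:
 */
struct toy_node {
	struct toy_node	*next;		/* stand-in for TAILQ_NEXT() */
};

static int
toy_head_is_descending(const struct toy_node *head)
{
	return head->next != NULL && head == head->next + 1;
}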
*/ 1435 KASSERT(i->fpgsz > 0); 1436 KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low); 1437 KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz 1438 <= pmr->high); 1439 1440 /* Validate each page in this range. */ 1441 for (lcv = 0; lcv < i->fpgsz; lcv++) { 1442 /* 1443 * Only the first page has a size specification. 1444 * Rest is size 0. 1445 */ 1446 KASSERT(lcv == 0 || i[lcv].fpgsz == 0); 1447 /* 1448 * Flag check. 1449 */ 1450 KASSERT(VALID_FLAGS(i[lcv].pg_flags) && 1451 (i[lcv].pg_flags & PQ_FREE) == PQ_FREE); 1452 /* 1453 * Free pages are: 1454 * - not wired 1455 * - have no vm_anon 1456 * - have no uvm_object 1457 */ 1458 KASSERT(i[lcv].wire_count == 0); 1459 KASSERT(i[lcv].uanon == (void*)0xdeadbeef || 1460 i[lcv].uanon == NULL); 1461 KASSERT(i[lcv].uobject == (void*)0xdeadbeef || 1462 i[lcv].uobject == NULL); 1463 /* 1464 * Pages in a single range always have the same 1465 * memtype. 1466 */ 1467 KASSERT(uvm_pmr_pg_to_memtype(&i[0]) == 1468 uvm_pmr_pg_to_memtype(&i[lcv])); 1469 } 1470 1471 /* Check that it shouldn't be joined with its predecessor. */ 1472 prev = RBT_PREV(uvm_pmr_addr, i); 1473 if (prev != NULL) { 1474 KASSERT(uvm_pmr_pg_to_memtype(i) != 1475 uvm_pmr_pg_to_memtype(prev) || 1476 atop(VM_PAGE_TO_PHYS(i)) > 1477 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz || 1478 prev + prev->fpgsz != i); 1479 } 1480 1481 /* Assert i is in the size tree as well. */ 1482 if (i->fpgsz == 1) { 1483 TAILQ_FOREACH(xref, 1484 &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) { 1485 if (xref == i) 1486 break; 1487 } 1488 KASSERT(xref == i); 1489 } else { 1490 KASSERT(RBT_FIND(uvm_pmr_size, 1491 &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) == 1492 i + 1); 1493 } 1494 } 1495 1496 /* Validate size tree. */ 1497 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 1498 for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) { 1499 next = uvm_pmr_nextsz(pmr, i, mti); 1500 if (next != NULL) { 1501 KASSERT(i->fpgsz <= 1502 next->fpgsz); 1503 } 1504 1505 /* Assert i is in the addr tree as well. */ 1506 KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i); 1507 1508 /* Assert i is of the correct memory type. */ 1509 KASSERT(uvm_pmr_pg_to_memtype(i) == mti); 1510 } 1511 } 1512 1513 /* Validate nsegs statistic. */ 1514 lcv = 0; 1515 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) 1516 lcv++; 1517 KASSERT(pmr->nsegs == lcv); 1518 } 1519 #endif /* DEBUG */ 1520 1521 /* 1522 * Split pmr at split point pageno. 1523 * Called with fpageq unlocked. 1524 * 1525 * Split is only applied if a pmemrange spans pageno. 1526 */ 1527 void 1528 uvm_pmr_split(paddr_t pageno) 1529 { 1530 struct uvm_pmemrange *pmr, *drain; 1531 struct vm_page *rebuild, *prev, *next; 1532 psize_t prev_sz; 1533 1534 uvm_lock_fpageq(); 1535 pmr = uvm_pmemrange_find(pageno); 1536 if (pmr == NULL || !(pmr->low < pageno)) { 1537 /* No split required. */ 1538 uvm_unlock_fpageq(); 1539 return; 1540 } 1541 1542 KASSERT(pmr->low < pageno); 1543 KASSERT(pmr->high > pageno); 1544 1545 /* 1546 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into 1547 * uvm_kmemalloc which calls into pmemrange, making the locking 1548 * a bit hard, so we just race! 1549 */ 1550 uvm_unlock_fpageq(); 1551 drain = uvm_pmr_allocpmr(); 1552 uvm_lock_fpageq(); 1553 pmr = uvm_pmemrange_find(pageno); 1554 if (pmr == NULL || !(pmr->low < pageno)) { 1555 /* 1556 * We lost the race since someone else ran this or a related 1557 * function, however this should be triggered very rarely so 1558 * we just leak the pmr. 
1559 */ 1560 printf("uvm_pmr_split: lost one pmr\n"); 1561 uvm_unlock_fpageq(); 1562 return; 1563 } 1564 1565 drain->low = pageno; 1566 drain->high = pmr->high; 1567 drain->use = pmr->use; 1568 1569 uvm_pmr_assertvalid(pmr); 1570 uvm_pmr_assertvalid(drain); 1571 KASSERT(drain->nsegs == 0); 1572 1573 RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) { 1574 if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno) 1575 break; 1576 } 1577 if (rebuild == NULL) 1578 prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 1579 else 1580 prev = RBT_PREV(uvm_pmr_addr, rebuild); 1581 KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1582 1583 /* 1584 * Handle free chunk that spans the split point. 1585 */ 1586 if (prev != NULL && 1587 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) { 1588 psize_t before, after; 1589 1590 KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1591 1592 uvm_pmr_remove(pmr, prev); 1593 prev_sz = prev->fpgsz; 1594 before = pageno - atop(VM_PAGE_TO_PHYS(prev)); 1595 after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno; 1596 1597 KASSERT(before > 0); 1598 KASSERT(after > 0); 1599 1600 prev->fpgsz = before; 1601 uvm_pmr_insert(pmr, prev, 1); 1602 (prev + before)->fpgsz = after; 1603 uvm_pmr_insert(drain, prev + before, 1); 1604 } 1605 1606 /* Move free chunks that no longer fall in the range. */ 1607 for (; rebuild != NULL; rebuild = next) { 1608 next = RBT_NEXT(uvm_pmr_addr, rebuild); 1609 1610 uvm_pmr_remove(pmr, rebuild); 1611 uvm_pmr_insert(drain, rebuild, 1); 1612 } 1613 1614 pmr->high = pageno; 1615 uvm_pmr_assertvalid(pmr); 1616 uvm_pmr_assertvalid(drain); 1617 1618 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain); 1619 uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain); 1620 uvm_unlock_fpageq(); 1621 } 1622 1623 /* 1624 * Increase the usage counter for the given range of memory. 1625 * 1626 * The more usage counters a given range of memory has, the more will be 1627 * attempted not to allocate from it. 1628 * 1629 * Addresses here are in paddr_t, not page-numbers. 1630 * The lowest and highest allowed address are specified. 1631 */ 1632 void 1633 uvm_pmr_use_inc(paddr_t low, paddr_t high) 1634 { 1635 struct uvm_pmemrange *pmr; 1636 paddr_t sz; 1637 1638 /* pmr uses page numbers, translate low and high. */ 1639 high++; 1640 high = atop(trunc_page(high)); 1641 low = atop(round_page(low)); 1642 uvm_pmr_split(low); 1643 uvm_pmr_split(high); 1644 1645 sz = 0; 1646 uvm_lock_fpageq(); 1647 /* Increase use count on segments in range. */ 1648 RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) { 1649 if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) { 1650 TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use); 1651 pmr->use++; 1652 sz += pmr->high - pmr->low; 1653 uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr); 1654 } 1655 uvm_pmr_assertvalid(pmr); 1656 } 1657 uvm_unlock_fpageq(); 1658 1659 KASSERT(sz >= high - low); 1660 } 1661 1662 /* 1663 * Allocate a pmemrange. 1664 * 1665 * If called from uvm_page_init, the uvm_pageboot_alloc is used. 1666 * If called after uvm_init, malloc is used. 1667 * (And if called in between, you're dead.) 1668 */ 1669 struct uvm_pmemrange * 1670 uvm_pmr_allocpmr(void) 1671 { 1672 struct uvm_pmemrange *nw; 1673 int i; 1674 1675 /* We're only ever hitting the !uvm.page_init_done case for now. 
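/*
 * Worked example (added for illustration, not from the original file) of
 * the address-to-page rounding in uvm_pmr_use_inc() above, assuming 4KB
 * pages:
 *
 *   low  = 0x00100001  ->  atop(round_page(low))      = 0x101
 *   high = 0x00ffffff  ->  atop(trunc_page(high + 1)) = 0x1000
 *
 * The inclusive byte range [0x00100001, 0x00ffffff] thus becomes the
 * exclusive page range [0x101, 0x1000), and uvm_pmr_split() is applied at
 * those two page numbers.
 */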
*/ 1676 if (!uvm.page_init_done) { 1677 nw = (struct uvm_pmemrange *) 1678 uvm_pageboot_alloc(sizeof(struct uvm_pmemrange)); 1679 } else { 1680 nw = malloc(sizeof(struct uvm_pmemrange), 1681 M_VMMAP, M_NOWAIT); 1682 } 1683 KASSERT(nw != NULL); 1684 memset(nw, 0, sizeof(struct uvm_pmemrange)); 1685 RBT_INIT(uvm_pmr_addr, &nw->addr); 1686 for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) { 1687 RBT_INIT(uvm_pmr_size, &nw->size[i]); 1688 TAILQ_INIT(&nw->single[i]); 1689 } 1690 return nw; 1691 } 1692 1693 /* 1694 * Initialization of pmr. 1695 * Called by uvm_page_init. 1696 * 1697 * Sets up pmemranges. 1698 */ 1699 void 1700 uvm_pmr_init(void) 1701 { 1702 struct uvm_pmemrange *new_pmr; 1703 int i; 1704 1705 TAILQ_INIT(&uvm.pmr_control.use); 1706 RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 1707 TAILQ_INIT(&uvm.pmr_control.allocs); 1708 1709 /* By default, one range for the entire address space. */ 1710 new_pmr = uvm_pmr_allocpmr(); 1711 new_pmr->low = 0; 1712 new_pmr->high = atop((paddr_t)-1) + 1; 1713 1714 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr); 1715 uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr); 1716 1717 for (i = 0; uvm_md_constraints[i] != NULL; i++) { 1718 uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low, 1719 uvm_md_constraints[i]->ucr_high); 1720 } 1721 } 1722 1723 /* 1724 * Find the pmemrange that contains the given page number. 1725 * 1726 * (Manually traverses the binary tree, because that is cheaper on stack 1727 * usage.) 1728 */ 1729 struct uvm_pmemrange * 1730 uvm_pmemrange_find(paddr_t pageno) 1731 { 1732 struct uvm_pmemrange *pmr; 1733 1734 pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 1735 while (pmr != NULL) { 1736 if (pmr->low > pageno) 1737 pmr = RBT_LEFT(uvm_pmemrange_addr, pmr); 1738 else if (pmr->high <= pageno) 1739 pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr); 1740 else 1741 break; 1742 } 1743 1744 return pmr; 1745 } 1746 1747 #if defined(DDB) || defined(DEBUG) 1748 /* 1749 * Return true if the given page is in any of the free lists. 1750 * Used by uvm_page_printit. 1751 * This function is safe, even if the page is not on the freeq. 1752 * Note: does not apply locking, only called from ddb. 1753 */ 1754 int 1755 uvm_pmr_isfree(struct vm_page *pg) 1756 { 1757 struct vm_page *r; 1758 struct uvm_pmemrange *pmr; 1759 1760 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1761 if (pmr == NULL) 1762 return 0; 1763 r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 1764 if (r == NULL) 1765 r = RBT_MAX(uvm_pmr_addr, &pmr->addr); 1766 else if (r != pg) 1767 r = RBT_PREV(uvm_pmr_addr, r); 1768 if (r == NULL) 1769 return 0; /* Empty tree. */ 1770 1771 KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg))); 1772 return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz > 1773 atop(VM_PAGE_TO_PHYS(pg)); 1774 } 1775 #endif /* DEBUG */ 1776 1777 /* 1778 * Given a root of a tree, find a range which intersects start, end and 1779 * is of the same memtype. 1780 * 1781 * Page must be in the address tree. 1782 */ 1783 struct vm_page* 1784 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root, 1785 paddr_t start, paddr_t end, int memtype) 1786 { 1787 int direction; 1788 struct vm_page *root; 1789 struct vm_page *high, *high_next; 1790 struct vm_page *low, *low_next; 1791 1792 KDASSERT(pmr != NULL && init_root != NULL); 1793 root = init_root; 1794 1795 /* Which direction to use for searching. 
*/ 1796 if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start) 1797 direction = 1; 1798 else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end) 1799 direction = -1; 1800 else /* nothing to do */ 1801 return root; 1802 1803 /* First, update root to fall within the chosen range. */ 1804 while (root && !PMR_INTERSECTS_WITH( 1805 atop(VM_PAGE_TO_PHYS(root)), 1806 atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz, 1807 start, end)) { 1808 if (direction == 1) 1809 root = RBT_RIGHT(uvm_objtree, root); 1810 else 1811 root = RBT_LEFT(uvm_objtree, root); 1812 } 1813 if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype) 1814 return root; 1815 1816 /* 1817 * Root is valid, but of the wrong memtype. 1818 * 1819 * Try to find a range that has the given memtype in the subtree 1820 * (memtype mismatches are costly, either because the conversion 1821 * is expensive, or a later allocation will need to do the opposite 1822 * conversion, which will be expensive). 1823 * 1824 * 1825 * First, simply increase address until we hit something we can use. 1826 * Cache the upper page, so we can page-walk later. 1827 */ 1828 high = root; 1829 high_next = RBT_RIGHT(uvm_objtree, high); 1830 while (high_next != NULL && PMR_INTERSECTS_WITH( 1831 atop(VM_PAGE_TO_PHYS(high_next)), 1832 atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz, 1833 start, end)) { 1834 high = high_next; 1835 if (uvm_pmr_pg_to_memtype(high) == memtype) 1836 return high; 1837 high_next = RBT_RIGHT(uvm_objtree, high); 1838 } 1839 1840 /* 1841 * Second, decrease the address until we hit something we can use. 1842 * Cache the lower page, so we can page-walk later. 1843 */ 1844 low = root; 1845 low_next = RBT_LEFT(uvm_objtree, low); 1846 while (low_next != NULL && PMR_INTERSECTS_WITH( 1847 atop(VM_PAGE_TO_PHYS(low_next)), 1848 atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz, 1849 start, end)) { 1850 low = low_next; 1851 if (uvm_pmr_pg_to_memtype(low) == memtype) 1852 return low; 1853 low_next = RBT_LEFT(uvm_objtree, low); 1854 } 1855 1856 if (low == high) 1857 return NULL; 1858 1859 /* No hits. Walk the address tree until we find something usable. */ 1860 for (low = RBT_NEXT(uvm_pmr_addr, low); 1861 low != high; 1862 low = RBT_NEXT(uvm_pmr_addr, low)) { 1863 KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)), 1864 atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz, 1865 start, end)); 1866 if (uvm_pmr_pg_to_memtype(low) == memtype) 1867 return low; 1868 } 1869 1870 /* Nothing found. */ 1871 return NULL; 1872 } 1873 1874 /* 1875 * Allocate any page, the fastest way. Page number constraints only. 1876 */ 1877 psize_t 1878 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result, 1879 paddr_t start, paddr_t end, int memtype_only) 1880 { 1881 struct uvm_pmemrange *pmr; 1882 struct vm_page *found, *splitpg; 1883 psize_t fcount; 1884 int memtype; 1885 1886 fcount = 0; 1887 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1888 /* We're done. */ 1889 if (fcount == count) 1890 break; 1891 1892 /* Outside requested range. */ 1893 if (!(start == 0 && end == 0) && 1894 !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 1895 continue; 1896 1897 /* Range is empty. */ 1898 if (pmr->nsegs == 0) 1899 continue; 1900 1901 /* Loop over all memtypes, starting at memtype_init. */ 1902 memtype = memtype_init; 1903 while (fcount != count) { 1904 found = TAILQ_FIRST(&pmr->single[memtype]); 1905 /* 1906 * If found is outside the range, walk the list 1907 * until we find something that intersects with 1908 * boundaries. 
1909 */ 1910 while (found && !PMR_INTERSECTS_WITH( 1911 atop(VM_PAGE_TO_PHYS(found)), 1912 atop(VM_PAGE_TO_PHYS(found)) + 1, 1913 start, end)) 1914 found = TAILQ_NEXT(found, pageq); 1915 1916 if (found == NULL) { 1917 /* 1918 * Check if the size tree contains a range 1919 * that intersects with the boundaries. As the 1920 * allocation is for any page, try the smallest 1921 * range so that large ranges are preserved for 1922 * more constrained cases. Only one entry is 1923 * checked here, to avoid a brute-force search. 1924 * 1925 * Note that a size tree gives pg[1] instead of 1926 * pg[0]. 1927 */ 1928 found = RBT_MIN(uvm_pmr_size, 1929 &pmr->size[memtype]); 1930 if (found != NULL) { 1931 found--; 1932 if (!PMR_INTERSECTS_WITH( 1933 atop(VM_PAGE_TO_PHYS(found)), 1934 atop(VM_PAGE_TO_PHYS(found)) + 1935 found->fpgsz, start, end)) 1936 found = NULL; 1937 } 1938 } 1939 if (found == NULL) { 1940 /* 1941 * Try address-guided search to meet the page 1942 * number constraints. 1943 */ 1944 found = RBT_ROOT(uvm_pmr_addr, &pmr->addr); 1945 if (found != NULL) { 1946 found = uvm_pmr_rootupdate(pmr, found, 1947 start, end, memtype); 1948 } 1949 } 1950 if (found != NULL) { 1951 uvm_pmr_assertvalid(pmr); 1952 uvm_pmr_remove_size(pmr, found); 1953 1954 /* 1955 * If the page intersects the end, then it'll 1956 * need splitting. 1957 * 1958 * Note that we don't need to split if the page 1959 * intersects start: the drain function will 1960 * simply stop on hitting start. 1961 */ 1962 if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) + 1963 found->fpgsz > end) { 1964 psize_t splitsz = 1965 atop(VM_PAGE_TO_PHYS(found)) + 1966 found->fpgsz - end; 1967 1968 uvm_pmr_remove_addr(pmr, found); 1969 uvm_pmr_assertvalid(pmr); 1970 found->fpgsz -= splitsz; 1971 splitpg = found + found->fpgsz; 1972 splitpg->fpgsz = splitsz; 1973 uvm_pmr_insert(pmr, splitpg, 1); 1974 1975 /* 1976 * At this point, splitpg and found 1977 * actually should be joined. 1978 * But we explicitly disable that, 1979 * because we will start subtracting 1980 * from found. 1981 */ 1982 KASSERT(start == 0 || 1983 atop(VM_PAGE_TO_PHYS(found)) + 1984 found->fpgsz > start); 1985 uvm_pmr_insert_addr(pmr, found, 1); 1986 } 1987 1988 /* 1989 * Fetch pages from the end. 1990 * If the range is larger than the requested 1991 * number of pages, this saves us an addr-tree 1992 * update. 1993 * 1994 * Since we take from the end and insert at 1995 * the head, any ranges keep preserved. 1996 */ 1997 while (found->fpgsz > 0 && fcount < count && 1998 (start == 0 || 1999 atop(VM_PAGE_TO_PHYS(found)) + 2000 found->fpgsz > start)) { 2001 found->fpgsz--; 2002 fcount++; 2003 TAILQ_INSERT_HEAD(result, 2004 &found[found->fpgsz], pageq); 2005 } 2006 if (found->fpgsz > 0) { 2007 uvm_pmr_insert_size(pmr, found); 2008 KDASSERT(fcount == count); 2009 uvm_pmr_assertvalid(pmr); 2010 return fcount; 2011 } 2012 2013 /* 2014 * Delayed addr-tree removal. 2015 */ 2016 uvm_pmr_remove_addr(pmr, found); 2017 uvm_pmr_assertvalid(pmr); 2018 } else { 2019 if (memtype_only) 2020 break; 2021 /* 2022 * Skip to the next memtype. 2023 */ 2024 memtype += 1; 2025 if (memtype == UVM_PMR_MEMTYPE_MAX) 2026 memtype = 0; 2027 if (memtype == memtype_init) 2028 break; 2029 } 2030 } 2031 } 2032 2033 /* 2034 * Search finished. 2035 * 2036 * Ran out of ranges before enough pages were gathered, or we hit the 2037 * case where found->fpgsz == count - fcount, in which case the 2038 * above exit condition didn't trigger. 2039 * 2040 * On failure, caller will free the pages. 
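/*
 * Illustrative sketch (not from the original file): uvm_pmr_get1page()
 * above carves pages off the *tail* of a free range (found->fpgsz--), so
 * the range keeps its starting address and its address-tree node needs no
 * update until the range is fully drained. A toy version with hypothetical
 * names:
 */
struct toy_range {
	unsigned long	start;		/* first page number, the tree key */
	unsigned long	npages;		/* length of the free range */
};

/* Returns the page number taken; the key (start) never changes. */
static unsigned long
toy_take_from_tail(struct toy_range *r)
{
	r->npages--;
	return r->start + r->npages;
}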
2041 */ 2042 return fcount; 2043 } 2044 2045 #ifdef DDB 2046 /* 2047 * Print information about pmemrange. 2048 * Does not do locking (so either call it from DDB or acquire fpageq lock 2049 * before invoking. 2050 */ 2051 void 2052 uvm_pmr_print(void) 2053 { 2054 struct uvm_pmemrange *pmr; 2055 struct vm_page *pg; 2056 psize_t size[UVM_PMR_MEMTYPE_MAX]; 2057 psize_t free; 2058 int useq_len; 2059 int mt; 2060 2061 printf("Ranges, use queue:\n"); 2062 useq_len = 0; 2063 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 2064 useq_len++; 2065 free = 0; 2066 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 2067 pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]); 2068 if (pg != NULL) 2069 pg--; 2070 else 2071 pg = TAILQ_FIRST(&pmr->single[mt]); 2072 size[mt] = (pg == NULL ? 0 : pg->fpgsz); 2073 2074 RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr) 2075 free += pg->fpgsz; 2076 } 2077 2078 printf("* [0x%lx-0x%lx] use=%d nsegs=%ld", 2079 (unsigned long)pmr->low, (unsigned long)pmr->high, 2080 pmr->use, (unsigned long)pmr->nsegs); 2081 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 2082 printf(" maxsegsz[%d]=0x%lx", mt, 2083 (unsigned long)size[mt]); 2084 } 2085 printf(" free=0x%lx\n", (unsigned long)free); 2086 } 2087 printf("#ranges = %d\n", useq_len); 2088 } 2089 #endif 2090 2091 /* 2092 * uvm_wait_pla: wait (sleep) for the page daemon to free some pages 2093 * in a specific physmem area. 2094 * 2095 * Returns ENOMEM if the pagedaemon failed to free any pages. 2096 * If not failok, failure will lead to panic. 2097 * 2098 * Must be called with fpageq locked. 2099 */ 2100 int 2101 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok) 2102 { 2103 struct uvm_pmalloc pma; 2104 const char *wmsg = "pmrwait"; 2105 2106 KASSERT(curcpu()->ci_idepth == 0); 2107 2108 if (curproc == uvm.pagedaemon_proc) { 2109 /* 2110 * This is not that uncommon when the pagedaemon is trying 2111 * to flush out a large mmapped file. VOP_WRITE will circle 2112 * back through the buffer cache and try to get more memory. 2113 * The pagedaemon starts by calling bufbackoff, but we can 2114 * easily use up that reserve in a single scan iteration. 2115 */ 2116 uvm_unlock_fpageq(); 2117 if (bufbackoff(NULL, atop(size)) >= atop(size)) { 2118 uvm_lock_fpageq(); 2119 return 0; 2120 } 2121 uvm_lock_fpageq(); 2122 2123 /* 2124 * XXX detect pagedaemon deadlock - see comment in 2125 * uvm_wait(), as this is exactly the same issue. 2126 */ 2127 printf("pagedaemon: wait_pla deadlock detected!\n"); 2128 msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, 2129 MSEC_TO_NSEC(125)); 2130 #if defined(DEBUG) 2131 /* DEBUG: panic so we can debug it */ 2132 panic("wait_pla pagedaemon deadlock"); 2133 #endif 2134 return 0; 2135 } 2136 2137 for (;;) { 2138 pma.pm_constraint.ucr_low = low; 2139 pma.pm_constraint.ucr_high = high; 2140 pma.pm_size = size; 2141 pma.pm_flags = UVM_PMA_LINKED; 2142 TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq); 2143 2144 wakeup(&uvm.pagedaemon); /* wake the daemon! */ 2145 while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY)) 2146 msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP); 2147 2148 if (!(pma.pm_flags & UVM_PMA_FREED) && 2149 pma.pm_flags & UVM_PMA_FAIL) { 2150 if (failok) 2151 return ENOMEM; 2152 printf("uvm_wait: failed to free %ld pages between " 2153 "0x%lx-0x%lx\n", atop(size), low, high); 2154 } else 2155 return 0; 2156 } 2157 /* UNREACHABLE */ 2158 } 2159 2160 /* 2161 * Wake up uvm_pmalloc sleepers. 
2162 */ 2163 void 2164 uvm_wakeup_pla(paddr_t low, psize_t len) 2165 { 2166 struct uvm_pmalloc *pma, *pma_next; 2167 paddr_t high; 2168 2169 high = low + len; 2170 2171 /* Wake specific allocations waiting for this memory. */ 2172 for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL; 2173 pma = pma_next) { 2174 pma_next = TAILQ_NEXT(pma, pmq); 2175 2176 if (low < pma->pm_constraint.ucr_high && 2177 high > pma->pm_constraint.ucr_low) { 2178 pma->pm_flags |= UVM_PMA_FREED; 2179 if (!(pma->pm_flags & UVM_PMA_BUSY)) { 2180 pma->pm_flags &= ~UVM_PMA_LINKED; 2181 TAILQ_REMOVE(&uvm.pmr_control.allocs, pma, 2182 pmq); 2183 wakeup(pma); 2184 } 2185 } 2186 } 2187 } 2188 2189 void 2190 uvm_pagezero_thread(void *arg) 2191 { 2192 struct pglist pgl; 2193 struct vm_page *pg; 2194 int count; 2195 2196 /* Run at the lowest possible priority. */ 2197 curproc->p_p->ps_nice = NZERO + PRIO_MAX; 2198 2199 KERNEL_UNLOCK(); 2200 2201 TAILQ_INIT(&pgl); 2202 for (;;) { 2203 uvm_lock_fpageq(); 2204 while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET || 2205 (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY, 2206 &pgl, 0, 0, 1)) == 0) { 2207 msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock, 2208 MAXPRI, "pgzero", INFSLP); 2209 } 2210 uvm_unlock_fpageq(); 2211 2212 TAILQ_FOREACH(pg, &pgl, pageq) { 2213 uvm_pagezero(pg); 2214 atomic_setbits_int(&pg->pg_flags, PG_ZERO); 2215 } 2216 2217 uvm_lock_fpageq(); 2218 while (!TAILQ_EMPTY(&pgl)) 2219 uvm_pmr_remove_1strange(&pgl, 0, NULL, 0); 2220 uvmexp.zeropages += count; 2221 uvm_unlock_fpageq(); 2222 2223 yield(); 2224 } 2225 } 2226 2227 #if defined(MULTIPROCESSOR) && defined(__HAVE_UVM_PERCPU) 2228 int 2229 uvm_pmr_cache_alloc(struct uvm_pmr_cache_item *upci) 2230 { 2231 struct vm_page *pg; 2232 struct pglist pgl; 2233 int flags = UVM_PLA_NOWAIT|UVM_PLA_NOWAKE; 2234 int npages = UVM_PMR_CACHEMAGSZ; 2235 2236 splassert(IPL_VM); 2237 KASSERT(upci->upci_npages == 0); 2238 2239 TAILQ_INIT(&pgl); 2240 if (uvm_pmr_getpages(npages, 0, 0, 1, 0, npages, flags, &pgl)) 2241 return -1; 2242 2243 while ((pg = TAILQ_FIRST(&pgl)) != NULL) { 2244 TAILQ_REMOVE(&pgl, pg, pageq); 2245 upci->upci_pages[upci->upci_npages] = pg; 2246 upci->upci_npages++; 2247 } 2248 atomic_add_int(&uvmexp.percpucaches, npages); 2249 2250 return 0; 2251 } 2252 2253 struct vm_page * 2254 uvm_pmr_cache_get(int flags) 2255 { 2256 struct uvm_pmr_cache *upc = &curcpu()->ci_uvm; 2257 struct uvm_pmr_cache_item *upci; 2258 struct vm_page *pg; 2259 int s; 2260 2261 s = splvm(); 2262 upci = &upc->upc_magz[upc->upc_actv]; 2263 if (upci->upci_npages == 0) { 2264 unsigned int prev; 2265 2266 prev = (upc->upc_actv == 0) ? 
struct vm_page *
uvm_pmr_cache_get(int flags)
{
	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
	struct uvm_pmr_cache_item *upci;
	struct vm_page *pg;
	int s;

	s = splvm();
	upci = &upc->upc_magz[upc->upc_actv];
	if (upci->upci_npages == 0) {
		unsigned int prev;

		prev = (upc->upc_actv == 0) ? 1 : 0;
		upci = &upc->upc_magz[prev];
		if (upci->upci_npages == 0) {
			atomic_inc_int(&uvmexp.pcpmiss);
			if (uvm_pmr_cache_alloc(upci)) {
				splx(s);
				return uvm_pmr_getone(flags);
			}
		}
		/* Swap magazines */
		upc->upc_actv = prev;
	} else {
		atomic_inc_int(&uvmexp.pcphit);
	}

	atomic_dec_int(&uvmexp.percpucaches);
	upci->upci_npages--;
	pg = upci->upci_pages[upci->upci_npages];
	splx(s);

	if (flags & UVM_PLA_ZERO)
		uvm_pagezero(pg);

	return pg;
}

unsigned int
uvm_pmr_cache_free(struct uvm_pmr_cache_item *upci)
{
	struct pglist pgl;
	unsigned int i;

	splassert(IPL_VM);

	TAILQ_INIT(&pgl);
	for (i = 0; i < upci->upci_npages; i++)
		TAILQ_INSERT_TAIL(&pgl, upci->upci_pages[i], pageq);

	uvm_pmr_freepageq(&pgl);

	atomic_sub_int(&uvmexp.percpucaches, upci->upci_npages);
	upci->upci_npages = 0;
	memset(upci->upci_pages, 0, sizeof(upci->upci_pages));

	return i;
}

void
uvm_pmr_cache_put(struct vm_page *pg)
{
	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
	struct uvm_pmr_cache_item *upci;
	struct uvm_pmemrange *pmr;
	int s;

	/*
	 * Always give back low pages to the allocator to not accelerate
	 * their exhaustion.
	 */
	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
	if (pmr->use > 0) {
		uvm_pmr_freepages(pg, 1);
		return;
	}

	s = splvm();
	upci = &upc->upc_magz[upc->upc_actv];
	if (upci->upci_npages >= UVM_PMR_CACHEMAGSZ) {
		unsigned int prev;

		prev = (upc->upc_actv == 0) ? 1 : 0;
		upci = &upc->upc_magz[prev];
		if (upci->upci_npages > 0)
			uvm_pmr_cache_free(upci);

		/* Swap magazines */
		upc->upc_actv = prev;
		KASSERT(upci->upci_npages == 0);
	}

	upci->upci_pages[upci->upci_npages] = pg;
	upci->upci_npages++;
	atomic_inc_int(&uvmexp.percpucaches);
	splx(s);
}

unsigned int
uvm_pmr_cache_drain(void)
{
	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
	unsigned int freed = 0;
	int s;

	s = splvm();
	freed += uvm_pmr_cache_free(&upc->upc_magz[0]);
	freed += uvm_pmr_cache_free(&upc->upc_magz[1]);
	splx(s);

	return freed;
}

#else /* !(MULTIPROCESSOR && __HAVE_UVM_PERCPU) */

struct vm_page *
uvm_pmr_cache_get(int flags)
{
	return uvm_pmr_getone(flags);
}

void
uvm_pmr_cache_put(struct vm_page *pg)
{
	uvm_pmr_freepages(pg, 1);
}

unsigned int
uvm_pmr_cache_drain(void)
{
	return 0;
}
#endif
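
/*
 * Minimal consumer sketch for the per-CPU cache interface (illustration
 * only, not taken from this file; error handling omitted):
 *
 *	struct vm_page *pg;
 *
 *	pg = uvm_pmr_cache_get(UVM_PLA_ZERO);	// one zeroed page
 *	...
 *	uvm_pmr_cache_put(pg);			// back to this CPU's cache
 *
 * Both calls exist whether or not the kernel is built with
 * MULTIPROCESSOR/__HAVE_UVM_PERCPU; without them they degrade to
 * uvm_pmr_getone()/uvm_pmr_freepages() as defined above.
 */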