/*	$NetBSD: uvm_pglist.c,v 1.70 2016/12/23 09:18:02 skrll Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * uvm_pglist.c: pglist functions
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.70 2016/12/23 09:18:02 skrll Exp $");

#include <sys/param.h>
#include <sys/systm.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>

#ifdef VM_PAGE_ALLOC_MEMORY_STATS
#define	STAT_INCR(v)	(v)++
#define	STAT_DECR(v)	do { \
		if ((v) == 0) \
			printf("%s:%d -- Already 0!\n", __FILE__, __LINE__); \
		else \
			(v)--; \
	} while (/*CONSTCOND*/ 0)
u_long	uvm_pglistalloc_npages;
#else
#define	STAT_INCR(v)
#define	STAT_DECR(v)
#endif

/*
 * uvm_pglistalloc: allocate a list of pages
 *
 * => allocated pages are placed onto an rlist.  rlist is
 *    initialized by uvm_pglistalloc.
 * => returns 0 on success or errno on failure
 * => implementation allocates a single segment if any constraints are
 *    imposed by call arguments.
 * => doesn't take into account clean non-busy pages on inactive list
 *    that could be used(?)
 * => params:
 *	size		the size of the allocation, rounded to page size.
 *	low		the low address of the allowed allocation range.
 *	high		the high address of the allowed allocation range.
 *	alignment	memory must be aligned to this power-of-two boundary.
 *	boundary	no segment in the allocation may cross this
 *			power-of-two boundary (relative to zero).
 */

static void
uvm_pglist_add(struct vm_page *pg, struct pglist *rlist)
{
	int free_list __unused, color __unused, pgflidx;

	KASSERT(mutex_owned(&uvm_fpageqlock));

#if PGFL_NQUEUES != 2
#error uvm_pglistalloc needs to be updated
#endif

	free_list = uvm_page_lookup_freelist(pg);
	color = VM_PGCOLOR_BUCKET(pg);
	pgflidx = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN;
#ifdef UVMDEBUG
	struct vm_page *tp;
	LIST_FOREACH(tp,
	    &uvm.page_free[free_list].pgfl_buckets[color].pgfl_queues[pgflidx],
	    pageq.list) {
		if (tp == pg)
			break;
	}
	if (tp == NULL)
		panic("uvm_pglistalloc: page not on freelist");
#endif
	LIST_REMOVE(pg, pageq.list);	/* global */
	LIST_REMOVE(pg, listq.list);	/* cpu */
	uvmexp.free--;
	if (pg->flags & PG_ZERO)
		uvmexp.zeropages--;
	VM_FREE_PAGE_TO_CPU(pg)->pages[pgflidx]--;
	pg->flags = PG_CLEAN;
	pg->pqflags = 0;
	pg->uobject = NULL;
	pg->uanon = NULL;
	TAILQ_INSERT_TAIL(rlist, pg, pageq.queue);
	STAT_INCR(uvm_pglistalloc_npages);
}
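
/*
 * Illustrative sketch, not compiled into the kernel: locating the free
 * queue a page lives on.  A free page sits on one global list indexed
 * by its hardware freelist, its color bucket, and whether it is known
 * to be zero-filled; uvm_pglist_add() above removes it from exactly
 * this queue (and from the per-CPU copy linked through listq.list).
 * The helper name example_queue_of is hypothetical.
 */
#if 0
static struct pgflist *
example_queue_of(struct vm_page *pg)
{
	int free_list = uvm_page_lookup_freelist(pg);
	int color = VM_PGCOLOR_BUCKET(pg);
	int queue = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN;

	return &uvm.page_free[free_list].pgfl_buckets[color].pgfl_queues[queue];
}
#endif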

static int
uvm_pglistalloc_c_ps(uvm_physseg_t psi, int num, paddr_t low, paddr_t high,
    paddr_t alignment, paddr_t boundary, struct pglist *rlist)
{
	signed int candidate, limit, candidateidx, end, idx, skip;
	int pagemask;
	bool second_pass;
#ifdef DEBUG
	paddr_t idxpa, lastidxpa;
	paddr_t cidx = 0;	/* XXX: GCC */
#endif
#ifdef PGALLOC_VERBOSE
	printf("pgalloc: contig %d pgs from psi %zd\n", num, (size_t)psi);
#endif

	KASSERT(mutex_owned(&uvm_fpageqlock));

	low = atop(low);
	high = atop(high);
	alignment = atop(alignment);

	/*
	 * Make sure that physseg falls within the range to be allocated from.
	 */
	if (high <= uvm_physseg_get_avail_start(psi) ||
	    low >= uvm_physseg_get_avail_end(psi))
		return 0;

	/*
	 * We start our search just after where the last allocation
	 * succeeded.
	 */
	candidate = roundup2(max(low, uvm_physseg_get_avail_start(psi) +
	    uvm_physseg_get_start_hint(psi)), alignment);
	limit = min(high, uvm_physseg_get_avail_end(psi));
	pagemask = ~((boundary >> PAGE_SHIFT) - 1);
	skip = 0;
	second_pass = false;

	for (;;) {
		bool ok = true;
		signed int cnt;

		if (candidate + num > limit) {
			if (uvm_physseg_get_start_hint(psi) == 0 || second_pass) {
				/*
				 * We've run past the allowable range.
				 */
				return 0; /* FAIL = 0 pages */
			}
			/*
			 * We've wrapped around the end of this segment
			 * so restart at the beginning but now our limit
			 * is where we started.
			 */
			second_pass = true;
			candidate = roundup2(max(low, uvm_physseg_get_avail_start(psi)), alignment);
			limit = min(limit, uvm_physseg_get_avail_start(psi) +
			    uvm_physseg_get_start_hint(psi));
			skip = 0;
			continue;
		}
		if (boundary != 0 &&
		    ((candidate ^ (candidate + num - 1)) & pagemask) != 0) {
			/*
			 * Region crosses boundary.  Jump to the boundary
			 * just crossed and ensure alignment.
			 */
			candidate = (candidate + num - 1) & pagemask;
			candidate = roundup2(candidate, alignment);
			skip = 0;
			continue;
		}
#ifdef DEBUG
		/*
		 * Make sure this is a managed physical page.
		 */

		if (uvm_physseg_find(candidate, &cidx) != psi)
			panic("pgalloc contig: botch1");
		if (cidx != candidate - uvm_physseg_get_start(psi))
			panic("pgalloc contig: botch2");
		if (uvm_physseg_find(candidate + num - 1, &cidx) != psi)
			panic("pgalloc contig: botch3");
		if (cidx != candidate - uvm_physseg_get_start(psi) + num - 1)
			panic("pgalloc contig: botch4");
#endif
		candidateidx = candidate - uvm_physseg_get_start(psi);
		end = candidateidx + num;

		/*
		 * Found a suitable starting page.  See if the range is free.
		 */
#ifdef PGALLOC_VERBOSE
		printf("%s: psi=%"PRIxPHYSSEG" candidate=%#x end=%#x skip=%#x, align=%#"PRIxPADDR,
		    __func__, psi, candidateidx, end, skip, alignment);
#endif
		/*
		 * We start at the end and work backwards since if we find a
		 * non-free page, it makes no sense to continue.
		 *
		 * But on the plus side we have "vetted" some number of free
		 * pages.  If this iteration fails, we may be able to skip
		 * testing most of those pages again in the next pass.
		 */
		for (idx = end - 1; idx >= candidateidx + skip; idx--) {
			if (VM_PAGE_IS_FREE(uvm_physseg_get_pg(psi, idx)) == 0) {
				ok = false;
				break;
			}

#ifdef DEBUG
			if (idx > candidateidx) {
				idxpa = VM_PAGE_TO_PHYS(uvm_physseg_get_pg(psi, idx));
				lastidxpa = VM_PAGE_TO_PHYS(uvm_physseg_get_pg(psi, idx - 1));
				if ((lastidxpa + PAGE_SIZE) != idxpa) {
					/*
					 * Region not contiguous.
					 */
					panic("pgalloc contig: botch5");
				}
				if (boundary != 0 &&
				    ((lastidxpa ^ idxpa) & ~(boundary - 1))
				    != 0) {
					/*
					 * Region crosses boundary.
					 */
					panic("pgalloc contig: botch6");
				}
			}
#endif
		}

		if (ok) {
			while (skip-- > 0) {
				KDASSERT(VM_PAGE_IS_FREE(uvm_physseg_get_pg(psi, candidateidx + skip)));
			}
#ifdef PGALLOC_VERBOSE
			printf(": ok\n");
#endif
			break;
		}

#ifdef PGALLOC_VERBOSE
		printf(": non-free at %#x\n", idx - candidateidx);
#endif
		/*
		 * count the number of pages we can advance
		 * since we know they aren't all free.
		 */
		cnt = idx + 1 - candidateidx;
		/*
		 * now round up that to the needed alignment.
		 */
		cnt = roundup2(cnt, alignment);
		/*
		 * The number of pages we can skip checking
		 * (might be 0 if cnt > num).
		 */
		skip = max(num - cnt, 0);
		candidate += cnt;
	}

	/*
	 * we have a chunk of memory that conforms to the requested constraints.
	 */
	for (idx = candidateidx; idx < end; idx++)
		uvm_pglist_add(uvm_physseg_get_pg(psi, idx), rlist);

	/*
	 * the next time we need to search this segment, start after this
	 * chunk of pages we just allocated.
	 */
	uvm_physseg_set_start_hint(psi, candidate + num -
	    uvm_physseg_get_avail_start(psi));
	KASSERTMSG(uvm_physseg_get_start_hint(psi) <=
	    uvm_physseg_get_avail_end(psi) - uvm_physseg_get_avail_start(psi),
	    "%x %u (%#x) <= %#"PRIxPADDR" - %#"PRIxPADDR" (%#"PRIxPADDR")",
	    candidate + num,
	    uvm_physseg_get_start_hint(psi), uvm_physseg_get_start_hint(psi),
	    uvm_physseg_get_avail_end(psi), uvm_physseg_get_avail_start(psi),
	    uvm_physseg_get_avail_end(psi) - uvm_physseg_get_avail_start(psi));

#ifdef PGALLOC_VERBOSE
	printf("got %d pgs\n", num);
#endif
	return num; /* number of pages allocated */
}
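
/*
 * Illustrative sketch, not compiled into the kernel: the boundary-crossing
 * test used above, worked through with hypothetical numbers.  With a 64KB
 * boundary and 4KB pages, boundary >> PAGE_SHIFT is 16 pages, so pagemask
 * keeps only the bits that select a 16-page-aligned block.  A candidate
 * run crosses the boundary iff its first and last page frame numbers fall
 * in different blocks.  The helper name crosses_boundary is made up.
 */
#if 0
/* pfns 14..17 straddle the block ending at pfn 15 -> crosses.	 */
/* pfns 16..19 sit inside a single block	  -> does not.	 */
static bool
crosses_boundary(unsigned candidate, unsigned num, unsigned boundary_pages)
{
	unsigned pagemask = ~(boundary_pages - 1);

	return ((candidate ^ (candidate + num - 1)) & pagemask) != 0;
}
#endif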

static int
uvm_pglistalloc_contig(int num, paddr_t low, paddr_t high, paddr_t alignment,
    paddr_t boundary, struct pglist *rlist)
{
	int fl;
	int error;

	uvm_physseg_t psi;
	/* Default to "lose". */
	error = ENOMEM;

	/*
	 * Block all memory allocation and lock the free list.
	 */
	mutex_spin_enter(&uvm_fpageqlock);

	/* Are there even any free pages? */
	if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
		goto out;

	for (fl = 0; fl < VM_NFREELIST; fl++) {
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
		for (psi = uvm_physseg_get_last(); uvm_physseg_valid_p(psi); psi = uvm_physseg_get_prev(psi))
#else
		for (psi = uvm_physseg_get_first(); uvm_physseg_valid_p(psi); psi = uvm_physseg_get_next(psi))
#endif
		{
			if (uvm_physseg_get_free_list(psi) != fl)
				continue;

			num -= uvm_pglistalloc_c_ps(psi, num, low, high,
			    alignment, boundary, rlist);
			if (num == 0) {
#ifdef PGALLOC_VERBOSE
				printf("pgalloc: %"PRIxMAX"-%"PRIxMAX"\n",
				    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_FIRST(rlist)),
				    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_LAST(rlist, pglist)));
#endif
				error = 0;
				goto out;
			}
		}
	}

out:
	/*
	 * check to see if we need to generate some free pages by waking
	 * the pagedaemon.
	 */

	uvm_kick_pdaemon();
	mutex_spin_exit(&uvm_fpageqlock);
	return (error);
}

static int
uvm_pglistalloc_s_ps(uvm_physseg_t psi, int num, paddr_t low, paddr_t high,
    struct pglist *rlist)
{
	int todo, limit, candidate;
	struct vm_page *pg;
	bool second_pass;
#ifdef PGALLOC_VERBOSE
	printf("pgalloc: simple %d pgs from psi %zd\n", num, (size_t)psi);
#endif

	KASSERT(mutex_owned(&uvm_fpageqlock));
	KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_start(psi));
	KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_end(psi));
	KASSERT(uvm_physseg_get_avail_start(psi) <= uvm_physseg_get_end(psi));
	KASSERT(uvm_physseg_get_avail_end(psi) <= uvm_physseg_get_end(psi));

	low = atop(low);
	high = atop(high);
	todo = num;
	candidate = max(low, uvm_physseg_get_avail_start(psi) +
	    uvm_physseg_get_start_hint(psi));
	limit = min(high, uvm_physseg_get_avail_end(psi));
	pg = uvm_physseg_get_pg(psi, candidate - uvm_physseg_get_start(psi));
	second_pass = false;

	/*
	 * Make sure that physseg falls within the range to be allocated from.
	 */
	if (high <= uvm_physseg_get_avail_start(psi) ||
	    low >= uvm_physseg_get_avail_end(psi))
		return 0;

again:
	for (;; candidate++, pg++) {
		if (candidate >= limit) {
			if (uvm_physseg_get_start_hint(psi) == 0 || second_pass) {
				candidate = limit - 1;
				break;
			}
			second_pass = true;
			candidate = max(low, uvm_physseg_get_avail_start(psi));
			limit = min(limit, uvm_physseg_get_avail_start(psi) +
			    uvm_physseg_get_start_hint(psi));
			pg = uvm_physseg_get_pg(psi, candidate - uvm_physseg_get_start(psi));
			goto again;
		}
#if defined(DEBUG)
		{
			paddr_t cidx = 0;
			const uvm_physseg_t bank = uvm_physseg_find(candidate, &cidx);
			KDASSERTMSG(bank == psi,
			    "uvm_physseg_find(%#x) (%"PRIxPHYSSEG ") != psi %"PRIxPHYSSEG,
			    candidate, bank, psi);
			KDASSERTMSG(cidx == candidate - uvm_physseg_get_start(psi),
			    "uvm_physseg_find(%#x): %#"PRIxPADDR" != off %"PRIxPADDR,
			    candidate, cidx, candidate - uvm_physseg_get_start(psi));
		}
#endif
		if (VM_PAGE_IS_FREE(pg) == 0)
			continue;

		uvm_pglist_add(pg, rlist);
		if (--todo == 0) {
			break;
		}
	}

	/*
	 * The next time we need to search this segment,
	 * start just after the pages we just allocated.
	 */
	uvm_physseg_set_start_hint(psi, candidate + 1 - uvm_physseg_get_avail_start(psi));
	KASSERTMSG(uvm_physseg_get_start_hint(psi) <= uvm_physseg_get_avail_end(psi) -
	    uvm_physseg_get_avail_start(psi),
	    "%#x %u (%#x) <= %#"PRIxPADDR" - %#"PRIxPADDR" (%#"PRIxPADDR")",
	    candidate + 1,
	    uvm_physseg_get_start_hint(psi),
	    uvm_physseg_get_start_hint(psi),
	    uvm_physseg_get_avail_end(psi),
	    uvm_physseg_get_avail_start(psi),
	    uvm_physseg_get_avail_end(psi) - uvm_physseg_get_avail_start(psi));

#ifdef PGALLOC_VERBOSE
	printf("got %d pgs\n", num - todo);
#endif
	return (num - todo); /* number of pages allocated */
}
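
/*
 * Illustrative sketch, not compiled into the kernel: the hinted two-pass
 * scan that both uvm_pglistalloc_s_ps() and uvm_pglistalloc_c_ps() use,
 * reduced to its skeleton.  The names hinted_scan and usable are
 * hypothetical.  The scan begins at the remembered hint, runs to the end
 * of the range, then wraps around once and gives up where it started.
 */
#if 0
static int
hinted_scan(int start, int end, int hint, bool (*usable)(int))
{
	int candidate = start + hint;
	int limit = end;
	bool second_pass = false;

	for (;;) {
		if (candidate >= limit) {
			if (hint == 0 || second_pass)
				return -1;	/* scanned everything */
			/* Wrap: rescan the part we skipped, up to the hint. */
			second_pass = true;
			candidate = start;
			limit = start + hint;
		}
		if ((*usable)(candidate))
			return candidate;
		candidate++;
	}
}
#endif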

static int
uvm_pglistalloc_simple(int num, paddr_t low, paddr_t high,
    struct pglist *rlist, int waitok)
{
	int fl, error;
	uvm_physseg_t psi;

	/* Default to "lose". */
	error = ENOMEM;

again:
	/*
	 * Block all memory allocation and lock the free list.
	 */
	mutex_spin_enter(&uvm_fpageqlock);

	/* Are there even any free pages? */
	if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
		goto out;

	for (fl = 0; fl < VM_NFREELIST; fl++) {
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
		for (psi = uvm_physseg_get_last(); uvm_physseg_valid_p(psi); psi = uvm_physseg_get_prev(psi))
#else
		for (psi = uvm_physseg_get_first(); uvm_physseg_valid_p(psi); psi = uvm_physseg_get_next(psi))
#endif
		{
			if (uvm_physseg_get_free_list(psi) != fl)
				continue;

			num -= uvm_pglistalloc_s_ps(psi, num, low, high, rlist);
			if (num == 0) {
				error = 0;
				goto out;
			}
		}
	}

out:
	/*
	 * check to see if we need to generate some free pages by waking
	 * the pagedaemon.
	 */

	uvm_kick_pdaemon();
	mutex_spin_exit(&uvm_fpageqlock);

	if (error) {
		if (waitok) {
			/* XXX perhaps some time limitation? */
#ifdef DEBUG
			printf("pglistalloc waiting\n");
#endif
			uvm_wait("pglalloc");
			goto again;
		} else
			uvm_pglistfree(rlist);
	}
#ifdef PGALLOC_VERBOSE
	if (!error)
		printf("pgalloc: %"PRIxMAX"..%"PRIxMAX"\n",
		    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_FIRST(rlist)),
		    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_LAST(rlist, pglist)));
#endif
	return (error);
}

int
uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
    paddr_t boundary, struct pglist *rlist, int nsegs, int waitok)
{
	int num, res;

	KASSERT((alignment & (alignment - 1)) == 0);
	KASSERT((boundary & (boundary - 1)) == 0);

	/*
	 * Our allocations are always page granularity, so our alignment
	 * must be, too.
	 */
	if (alignment < PAGE_SIZE)
		alignment = PAGE_SIZE;
	if (boundary != 0 && boundary < size)
		return (EINVAL);
	num = atop(round_page(size));
	low = roundup2(low, alignment);

	TAILQ_INIT(rlist);

	if ((nsegs < size >> PAGE_SHIFT) || (alignment != PAGE_SIZE) ||
	    (boundary != 0))
		res = uvm_pglistalloc_contig(num, low, high, alignment,
		    boundary, rlist);
	else
		res = uvm_pglistalloc_simple(num, low, high, rlist, waitok);

	return (res);
}
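
/*
 * Usage sketch, not compiled into the kernel: a hypothetical caller
 * allocating four physically contiguous pages below 16MB that must not
 * cross a 64KB boundary, then releasing them.  The constraint values and
 * the name example_dma_alloc are made up for illustration; real callers
 * (e.g. bus_dma back ends) pick constraints from their hardware.
 */
#if 0
static int
example_dma_alloc(void)
{
	struct pglist mlist;
	struct vm_page *pg;
	int error;

	/* nsegs == 1 with num > 1 (and boundary != 0) selects the
	 * contiguous allocator rather than the simple one. */
	error = uvm_pglistalloc(4 * PAGE_SIZE, 0, 0x1000000,
	    PAGE_SIZE, 0x10000, &mlist, 1, 0);
	if (error)
		return error;

	TAILQ_FOREACH(pg, &mlist, pageq.queue)
		printf("got page at %#"PRIxPADDR"\n", VM_PAGE_TO_PHYS(pg));

	uvm_pglistfree(&mlist);
	return 0;
}
#endif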
566 */ 567 568 mutex_spin_enter(&uvm_fpageqlock); 569 ucpu = curcpu()->ci_data.cpu_uvm; 570 while ((pg = TAILQ_FIRST(list)) != NULL) { 571 KASSERT(!uvmpdpol_pageisqueued_p(pg)); 572 TAILQ_REMOVE(list, pg, pageq.queue); 573 iszero = (pg->flags & PG_ZERO); 574 pg->pqflags = PQ_FREE; 575 #ifdef DEBUG 576 pg->uobject = (void *)0xdeadbeef; 577 pg->uanon = (void *)0xdeadbeef; 578 #endif /* DEBUG */ 579 #ifdef DEBUG 580 if (iszero) 581 uvm_pagezerocheck(pg); 582 #endif /* DEBUG */ 583 index = uvm_page_lookup_freelist(pg); 584 color = VM_PGCOLOR_BUCKET(pg); 585 queue = iszero ? PGFL_ZEROS : PGFL_UNKNOWN; 586 pg->offset = (uintptr_t)ucpu; 587 LIST_INSERT_HEAD(&uvm.page_free[index].pgfl_buckets[color]. 588 pgfl_queues[queue], pg, pageq.list); 589 LIST_INSERT_HEAD(&ucpu->page_free[index].pgfl_buckets[color]. 590 pgfl_queues[queue], pg, listq.list); 591 uvmexp.free++; 592 if (iszero) 593 uvmexp.zeropages++; 594 ucpu->pages[queue]++; 595 STAT_DECR(uvm_pglistalloc_npages); 596 } 597 if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) 598 ucpu->page_idle_zero = vm_page_zero_enable; 599 mutex_spin_exit(&uvm_fpageqlock); 600 } 601