/*	$NetBSD: uvm_pglist.c,v 1.58 2011/01/24 22:54:01 matt Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * uvm_pglist.c: pglist functions
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.58 2011/01/24 22:54:01 matt Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/proc.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>

#ifdef VM_PAGE_ALLOC_MEMORY_STATS
#define	STAT_INCR(v)	(v)++
#define	STAT_DECR(v)	do { \
		if ((v) == 0) \
			printf("%s:%d -- Already 0!\n", __FILE__, __LINE__); \
		else \
			(v)--; \
	} while (/*CONSTCOND*/ 0)
u_long	uvm_pglistalloc_npages;
#else
#define	STAT_INCR(v)
#define	STAT_DECR(v)
#endif

/*
 * uvm_pglistalloc: allocate a list of pages
 *
 * => allocated pages are placed onto an rlist.  rlist is
 *    initialized by uvm_pglistalloc.
 * => returns 0 on success or errno on failure
 * => implementation allocates a single segment if any constraints are
 *    imposed by call arguments.
 * => doesn't take into account clean non-busy pages on inactive list
 *    that could be used(?)
 * => params:
 *	size		the size of the allocation, rounded to page size.
 *	low		the low address of the allowed allocation range.
 *	high		the high address of the allowed allocation range.
 *	alignment	memory must be aligned to this power-of-two boundary.
 *	boundary	no segment in the allocation may cross this
 *			power-of-two boundary (relative to zero).
 */
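
/*
 * Illustrative sketch (not part of this file): a typical caller might use
 * uvm_pglistalloc() to grab a physically contiguous, boundary-constrained
 * buffer for DMA.  The function name and the #ifdef guard below are
 * hypothetical; only the uvm_pglistalloc()/uvm_pglistfree() calls reflect
 * the API defined in this file.
 */
#ifdef PGALLOC_USAGE_EXAMPLE
static int
pgalloc_usage_example(void)
{
	struct pglist mlist;
	int error;

	/*
	 * Ask for 64KB of physically contiguous memory below 16MB,
	 * aligned to 64KB and not crossing a 64KB boundary, waiting
	 * for memory if it is not immediately available.
	 */
	error = uvm_pglistalloc(64 * 1024, 0, 0xffffff, 64 * 1024,
	    64 * 1024, &mlist, 1, true);
	if (error)
		return error;

	/* ... map and use the pages on mlist ... */

	uvm_pglistfree(&mlist);
	return 0;
}
#endif /* PGALLOC_USAGE_EXAMPLE */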

static void
uvm_pglist_add(struct vm_page *pg, struct pglist *rlist)
{
	int free_list, color, pgflidx;

	KASSERT(mutex_owned(&uvm_fpageqlock));

#if PGFL_NQUEUES != 2
#error uvm_pglistalloc needs to be updated
#endif

	free_list = uvm_page_lookup_freelist(pg);
	color = VM_PGCOLOR_BUCKET(pg);
	pgflidx = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN;
#ifdef NOT_DEBUG
	struct vm_page *tp;
	LIST_FOREACH(tp,
	    &uvm.page_free[free_list].pgfl_buckets[color].pgfl_queues[pgflidx],
	    pageq.list) {
		if (tp == pg)
			break;
	}
	if (tp == NULL)
		panic("uvm_pglistalloc: page not on freelist");
#endif
	LIST_REMOVE(pg, pageq.list);	/* global */
	LIST_REMOVE(pg, listq.list);	/* cpu */
	uvmexp.free--;
	if (pg->flags & PG_ZERO)
		uvmexp.zeropages--;
	VM_FREE_PAGE_TO_CPU(pg)->pages[pgflidx]--;
	pg->flags = PG_CLEAN;
	pg->pqflags = 0;
	pg->uobject = NULL;
	pg->uanon = NULL;
	TAILQ_INSERT_TAIL(rlist, pg, pageq.queue);
	STAT_INCR(uvm_pglistalloc_npages);
}
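
/*
 * Note on the boundary test used in uvm_pglistalloc_c_ps() below
 * (illustrative numbers, assuming 4KB pages): for boundary = 64KB,
 * pagemask = ~((boundary >> PAGE_SHIFT) - 1) = ~15, so two page frame
 * numbers lie in the same 64KB chunk exactly when they agree in all
 * bits above the low four.  A candidate run [try, try + num - 1]
 * crosses a boundary when
 *
 *	((try ^ (try + num - 1)) & pagemask) != 0
 *
 * e.g. try = 13, num = 6 gives 13 ^ 18 = 31, and 31 & ~15 = 16 != 0,
 * so pages 13..18 straddle the chunk starting at page 16.  The retry
 * then jumps to (try + num - 1) & pagemask = 16, the first page of
 * the boundary chunk just crossed.
 */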

static int
uvm_pglistalloc_c_ps(struct vm_physseg *ps, int num, paddr_t low, paddr_t high,
    paddr_t alignment, paddr_t boundary, struct pglist *rlist)
{
	signed int try, limit, tryidx, end, idx, skip;
	struct vm_page *pgs;
	int pagemask;
	bool second_pass;
#ifdef DEBUG
	paddr_t idxpa, lastidxpa;
	int cidx = 0;	/* XXX: GCC */
#endif
#ifdef PGALLOC_VERBOSE
	printf("pgalloc: contig %d pgs from psi %zd\n", num, ps - vm_physmem);
#endif

	KASSERT(mutex_owned(&uvm_fpageqlock));

	low = atop(low);
	high = atop(high);
	alignment = atop(alignment);

	/*
	 * Make sure that physseg falls within the range to be allocated from.
	 */
	if (high <= ps->avail_start || low >= ps->avail_end)
		return 0;

	/*
	 * We start our search just after where the last allocation
	 * succeeded.
	 */
	try = roundup2(max(low, ps->avail_start + ps->start_hint), alignment);
	limit = min(high, ps->avail_end);
	pagemask = ~((boundary >> PAGE_SHIFT) - 1);
	skip = 0;
	second_pass = false;
	pgs = ps->pgs;

	for (;;) {
		bool ok = true;
		signed int cnt;

		if (try + num > limit) {
			if (ps->start_hint == 0 || second_pass) {
				/*
				 * We've run past the allowable range.
				 */
				return 0; /* FAIL = 0 pages */
			}
			/*
			 * We've wrapped around the end of this segment
			 * so restart at the beginning, but now our limit
			 * is where we started.
			 */
			second_pass = true;
			try = roundup2(max(low, ps->avail_start), alignment);
			limit = min(limit, ps->avail_start + ps->start_hint);
			skip = 0;
			continue;
		}
		if (boundary != 0 &&
		    ((try ^ (try + num - 1)) & pagemask) != 0) {
			/*
			 * Region crosses boundary.  Jump to the boundary
			 * just crossed and ensure alignment.
			 */
			try = (try + num - 1) & pagemask;
			try = roundup2(try, alignment);
			skip = 0;
			continue;
		}
#ifdef DEBUG
		/*
		 * Make sure this is a managed physical page.
		 */

		if (vm_physseg_find(try, &cidx) != ps - vm_physmem)
			panic("pgalloc contig: botch1");
		if (cidx != try - ps->start)
			panic("pgalloc contig: botch2");
		if (vm_physseg_find(try + num - 1, &cidx) != ps - vm_physmem)
			panic("pgalloc contig: botch3");
		if (cidx != try - ps->start + num - 1)
			panic("pgalloc contig: botch4");
#endif
		tryidx = try - ps->start;
		end = tryidx + num;

		/*
		 * Found a suitable starting page.  See if the range is free.
		 */
#ifdef PGALLOC_VERBOSE
		printf("%s: ps=%p try=%#x end=%#x skip=%#x, align=%#"PRIxPADDR,
		    __func__, ps, tryidx, end, skip, alignment);
#endif
		/*
		 * We start at the end and work backwards since if we find a
		 * non-free page, it makes no sense to continue.
		 *
		 * But on the plus side we have "vetted" some number of free
		 * pages.  If this iteration fails, we may be able to skip
		 * testing most of those pages again in the next pass.
		 */
		for (idx = end - 1; idx >= tryidx + skip; idx--) {
			if (VM_PAGE_IS_FREE(&pgs[idx]) == 0) {
				ok = false;
				break;
			}

#ifdef DEBUG
			if (idx > tryidx) {
				idxpa = VM_PAGE_TO_PHYS(&pgs[idx]);
				lastidxpa = VM_PAGE_TO_PHYS(&pgs[idx - 1]);
				if ((lastidxpa + PAGE_SIZE) != idxpa) {
					/*
					 * Region not contiguous.
					 */
					panic("pgalloc contig: botch5");
				}
				if (boundary != 0 &&
				    ((lastidxpa ^ idxpa) & ~(boundary - 1))
				    != 0) {
					/*
					 * Region crosses boundary.
					 */
					panic("pgalloc contig: botch6");
				}
			}
#endif
		}

		if (ok) {
			while (skip-- > 0) {
				KDASSERT(VM_PAGE_IS_FREE(&pgs[tryidx + skip]));
			}
#ifdef PGALLOC_VERBOSE
			printf(": ok\n");
#endif
			break;
		}

#ifdef PGALLOC_VERBOSE
		printf(": non-free at %#x\n", idx - tryidx);
#endif
		/*
		 * Count the number of pages we can advance
		 * since we know they aren't all free.
		 */
		cnt = idx + 1 - tryidx;
		/*
		 * Now round that up to the needed alignment.
		 */
		cnt = roundup2(cnt, alignment);
		/*
		 * The number of pages we can skip checking
		 * (might be 0 if cnt > num).
		 */
		skip = max(num - cnt, 0);
		try += cnt;
	}

	/*
	 * We have a chunk of memory that conforms to the requested constraints.
	 */
	for (idx = tryidx, pgs += idx; idx < end; idx++, pgs++)
		uvm_pglist_add(pgs, rlist);

	/*
	 * The next time we need to search this segment, start after this
	 * chunk of pages we just allocated.
	 */
	ps->start_hint = try + num - ps->avail_start;
	KASSERTMSG(ps->start_hint <= ps->avail_end - ps->avail_start,
	    ("%x %u (%#x) <= %#"PRIxPADDR" - %#"PRIxPADDR" (%#"PRIxPADDR")",
	    try + num,
	    ps->start_hint, ps->start_hint, ps->avail_end, ps->avail_start,
	    ps->avail_end - ps->avail_start));

#ifdef PGALLOC_VERBOSE
	printf("got %d pgs\n", num);
#endif
	return num; /* number of pages allocated */
}
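
/*
 * Worked example of the "skip" optimization in uvm_pglistalloc_c_ps()
 * above (illustrative numbers, with page-size alignment so the rounding
 * is a no-op): suppose num = 8 and the backward scan of the window
 * tryidx..tryidx+7 finds a non-free page at offset 2, having already
 * vetted offsets 3..7 as free.  Then cnt = 3, the window advances by 3
 * pages, and skip = num - cnt = 5, so the next backward scan runs only
 * down to the new tryidx + skip: it tests just the 3 newly exposed
 * pages and trusts the 5 pages it already vetted.
 */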

static int
uvm_pglistalloc_contig(int num, paddr_t low, paddr_t high, paddr_t alignment,
    paddr_t boundary, struct pglist *rlist)
{
	int fl, psi;
	struct vm_physseg *ps;
	int error;

	/* Default to "lose". */
	error = ENOMEM;

	/*
	 * Block all memory allocation and lock the free list.
	 */
	mutex_spin_enter(&uvm_fpageqlock);

	/* Are there even any free pages? */
	if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
		goto out;

	for (fl = 0; fl < VM_NFREELIST; fl++) {
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
		for (psi = vm_nphysseg - 1 ; psi >= 0 ; psi--)
#else
		for (psi = 0 ; psi < vm_nphysseg ; psi++)
#endif
		{
			ps = &vm_physmem[psi];

			if (ps->free_list != fl)
				continue;

			num -= uvm_pglistalloc_c_ps(ps, num, low, high,
			    alignment, boundary, rlist);
			if (num == 0) {
#ifdef PGALLOC_VERBOSE
				printf("pgalloc: %"PRIxMAX"-%"PRIxMAX"\n",
				    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_FIRST(rlist)),
				    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_LAST(rlist, pglist)));
#endif
				error = 0;
				goto out;
			}
		}
	}

out:
	/*
	 * Check to see if we need to generate some free pages by waking
	 * the pagedaemon.
	 */

	uvm_kick_pdaemon();
	mutex_spin_exit(&uvm_fpageqlock);
	return (error);
}

static int
uvm_pglistalloc_s_ps(struct vm_physseg *ps, int num, paddr_t low, paddr_t high,
    struct pglist *rlist)
{
	int todo, limit, try;
	struct vm_page *pg;
	bool second_pass;
#ifdef PGALLOC_VERBOSE
	printf("pgalloc: simple %d pgs from psi %zd\n", num, ps - vm_physmem);
#endif

	KASSERT(mutex_owned(&uvm_fpageqlock));
	KASSERT(ps->start <= ps->avail_start);
	KASSERT(ps->start <= ps->avail_end);
	KASSERT(ps->avail_start <= ps->end);
	KASSERT(ps->avail_end <= ps->end);

	low = atop(low);
	high = atop(high);
	todo = num;
	try = max(low, ps->avail_start + ps->start_hint);
	limit = min(high, ps->avail_end);
	pg = &ps->pgs[try - ps->start];
	second_pass = false;

	/*
	 * Make sure that physseg falls within the range to be allocated from.
	 */
	if (high <= ps->avail_start || low >= ps->avail_end)
		return 0;

	for (;; try++, pg++) {
		if (try >= limit) {
			if (ps->start_hint == 0 || second_pass) {
				try = limit - 1;
				break;
			}
			second_pass = true;
			try = max(low, ps->avail_start);
			limit = min(limit, ps->avail_start + ps->start_hint);
			pg = &ps->pgs[try - ps->start];
			continue;
		}
#if defined(DEBUG)
		{
			int cidx = 0;
			const int bank = vm_physseg_find(try, &cidx);
			KDASSERTMSG(bank == ps - vm_physmem,
			    ("vm_physseg_find(%#x) (%d) != ps %zd",
			    try, bank, ps - vm_physmem));
			KDASSERTMSG(cidx == try - ps->start,
			    ("vm_physseg_find(%#x): %#x != off %"PRIxPADDR,
			    try, cidx, try - ps->start));
		}
#endif
		if (VM_PAGE_IS_FREE(pg) == 0)
			continue;

		uvm_pglist_add(pg, rlist);
		if (--todo == 0) {
			break;
		}
	}

	/*
	 * The next time we need to search this segment,
	 * start just after the pages we just allocated.
	 */
	ps->start_hint = try + 1 - ps->avail_start;
	KASSERTMSG(ps->start_hint <= ps->avail_end - ps->avail_start,
	    ("%#x %u (%#x) <= %#"PRIxPADDR" - %#"PRIxPADDR" (%#"PRIxPADDR")",
	    try + 1,
	    ps->start_hint, ps->start_hint, ps->avail_end, ps->avail_start,
	    ps->avail_end - ps->avail_start));

#ifdef PGALLOC_VERBOSE
	printf("got %d pgs\n", num - todo);
#endif
	return (num - todo); /* number of pages allocated */
}
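
/*
 * Both per-segment scanners use the same two-pass, hint-based search.
 * Illustrative numbers: with avail_start = 100, avail_end = 200 and
 * start_hint = 30 (and low/high not constraining), the first pass scans
 * page frames 130..199; if that fails, the second pass wraps around and
 * scans 100..129 before giving up.  The whole segment is thus covered
 * exactly once, while successive allocations tend to march forward
 * instead of rescanning the start of the segment every time.
 */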

static int
uvm_pglistalloc_simple(int num, paddr_t low, paddr_t high,
    struct pglist *rlist, int waitok)
{
	int fl, psi, error;
	struct vm_physseg *ps;

	/* Default to "lose". */
	error = ENOMEM;

again:
	/*
	 * Block all memory allocation and lock the free list.
	 */
	mutex_spin_enter(&uvm_fpageqlock);

	/* Are there even any free pages? */
	if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
		goto out;

	for (fl = 0; fl < VM_NFREELIST; fl++) {
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
		for (psi = vm_nphysseg - 1 ; psi >= 0 ; psi--)
#else
		for (psi = 0 ; psi < vm_nphysseg ; psi++)
#endif
		{
			ps = &vm_physmem[psi];

			if (ps->free_list != fl)
				continue;

			num -= uvm_pglistalloc_s_ps(ps, num, low, high, rlist);
			if (num == 0) {
				error = 0;
				goto out;
			}
		}

	}

out:
	/*
	 * Check to see if we need to generate some free pages by waking
	 * the pagedaemon.
	 */

	uvm_kick_pdaemon();
	mutex_spin_exit(&uvm_fpageqlock);

	if (error) {
		if (waitok) {
			/* XXX perhaps some time limitation? */
#ifdef DEBUG
			printf("pglistalloc waiting\n");
#endif
			uvm_wait("pglalloc");
			goto again;
		} else
			uvm_pglistfree(rlist);
	}
#ifdef PGALLOC_VERBOSE
	if (!error)
		printf("pgalloc: %"PRIxMAX"..%"PRIxMAX"\n",
		    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_FIRST(rlist)),
		    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_LAST(rlist, pglist)));
#endif
	return (error);
}

int
uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
    paddr_t boundary, struct pglist *rlist, int nsegs, int waitok)
{
	int num, res;

	KASSERT((alignment & (alignment - 1)) == 0);
	KASSERT((boundary & (boundary - 1)) == 0);

	/*
	 * Our allocations are always page granularity, so our alignment
	 * must be, too.
	 */
	if (alignment < PAGE_SIZE)
		alignment = PAGE_SIZE;
	if (boundary != 0 && boundary < size)
		return (EINVAL);
	num = atop(round_page(size));
	low = roundup2(low, alignment);

	TAILQ_INIT(rlist);

	if ((nsegs < size >> PAGE_SHIFT) || (alignment != PAGE_SIZE) ||
	    (boundary != 0))
		res = uvm_pglistalloc_contig(num, low, high, alignment,
		    boundary, rlist);
	else
		res = uvm_pglistalloc_simple(num, low, high, rlist, waitok);

	return (res);
}

/*
 * uvm_pglistfree: free a list of pages
 *
 * => pages should already be unmapped
 */

void
uvm_pglistfree(struct pglist *list)
{
	struct uvm_cpu *ucpu;
	struct vm_page *pg;
	int index, color, queue;
	bool iszero;

	/*
	 * Lock the free list and free each page.
	 */

	mutex_spin_enter(&uvm_fpageqlock);
	ucpu = curcpu()->ci_data.cpu_uvm;
	while ((pg = TAILQ_FIRST(list)) != NULL) {
		KASSERT(!uvmpdpol_pageisqueued_p(pg));
		TAILQ_REMOVE(list, pg, pageq.queue);
		iszero = (pg->flags & PG_ZERO);
		pg->pqflags = PQ_FREE;
#ifdef DEBUG
		pg->uobject = (void *)0xdeadbeef;
		pg->uanon = (void *)0xdeadbeef;
#endif /* DEBUG */
#ifdef DEBUG
		if (iszero)
			uvm_pagezerocheck(pg);
#endif /* DEBUG */
		index = uvm_page_lookup_freelist(pg);
		color = VM_PGCOLOR_BUCKET(pg);
		queue = iszero ? PGFL_ZEROS : PGFL_UNKNOWN;
		pg->offset = (uintptr_t)ucpu;
		LIST_INSERT_HEAD(&uvm.page_free[index].pgfl_buckets[color].
		    pgfl_queues[queue], pg, pageq.list);
		LIST_INSERT_HEAD(&ucpu->page_free[index].pgfl_buckets[color].
		    pgfl_queues[queue], pg, listq.list);
		uvmexp.free++;
		if (iszero)
			uvmexp.zeropages++;
		ucpu->pages[queue]++;
		STAT_DECR(uvm_pglistalloc_npages);
	}
	if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN])
		ucpu->page_idle_zero = vm_page_zero_enable;
	mutex_spin_exit(&uvm_fpageqlock);
}
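
/*
 * Illustrative sketch (hypothetical, not part of UVM): walking the pages
 * returned by uvm_pglistalloc() to collect their physical addresses,
 * e.g. to load a hardware scatter/gather list.  Pages on the returned
 * list are linked through pageq.queue, as seen in uvm_pglistfree() above.
 */
#ifdef PGALLOC_USAGE_EXAMPLE
static void
pgalloc_walk_example(struct pglist *list)
{
	struct vm_page *pg;

	TAILQ_FOREACH(pg, list, pageq.queue) {
		paddr_t pa = VM_PAGE_TO_PHYS(pg);

		/*
		 * Hand "pa" to the device, e.g. by filling a DMA
		 * descriptor (hypothetical driver code).
		 */
		(void)pa;
	}
}
#endif /* PGALLOC_USAGE_EXAMPLE */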