/*	$NetBSD: uvm_pglist.c,v 1.60 2011/01/26 08:49:48 enami Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * uvm_pglist.c: pglist functions
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.60 2011/01/26 08:49:48 enami Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/proc.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>

#ifdef VM_PAGE_ALLOC_MEMORY_STATS
#define	STAT_INCR(v)	(v)++
#define	STAT_DECR(v)	do { \
		if ((v) == 0) \
			printf("%s:%d -- Already 0!\n", __FILE__, __LINE__); \
		else \
			(v)--; \
	} while (/*CONSTCOND*/ 0)
u_long	uvm_pglistalloc_npages;
#else
#define	STAT_INCR(v)
#define	STAT_DECR(v)
#endif

/*
 * uvm_pglistalloc: allocate a list of pages
 *
 * => allocated pages are placed onto an rlist.  rlist is
 *    initialized by uvm_pglistalloc.
 * => returns 0 on success or errno on failure
 * => implementation allocates a single segment if any constraints are
 *    imposed by call arguments.
 * => doesn't take into account clean non-busy pages on inactive list
 *    that could be used(?)
 * => params:
 *	size		the size of the allocation, rounded to page size.
 *	low		the low address of the allowed allocation range.
 *	high		the high address of the allowed allocation range.
 *	alignment	memory must be aligned to this power-of-two boundary.
 *	boundary	no segment in the allocation may cross this
 *			power-of-two boundary (relative to zero).
 */
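
/*
 * Usage sketch (editorial addition, not part of the original source;
 * the locals "mlist" and "error" are hypothetical).  A driver that
 * needs 64KB of physically contiguous memory below 16MB might call:
 *
 *	struct pglist mlist;
 *	int error;
 *
 *	error = uvm_pglistalloc(64 * 1024, 0, 0xffffff, PAGE_SIZE, 0,
 *	    &mlist, 1, 0);
 *	if (error)
 *		return error;
 *
 * With nsegs == 1 the request is forced down the contiguous path
 * below; the physical address of each returned page is available via
 * VM_PAGE_TO_PHYS().
 */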

static void
uvm_pglist_add(struct vm_page *pg, struct pglist *rlist)
{
	int free_list, color, pgflidx;

	KASSERT(mutex_owned(&uvm_fpageqlock));

#if PGFL_NQUEUES != 2
#error uvm_pglistalloc needs to be updated
#endif

	free_list = uvm_page_lookup_freelist(pg);
	color = VM_PGCOLOR_BUCKET(pg);
	pgflidx = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN;
#ifdef NOT_DEBUG
	struct vm_page *tp;
	LIST_FOREACH(tp,
	    &uvm.page_free[free_list].pgfl_buckets[color].pgfl_queues[pgflidx],
	    pageq.list) {
		if (tp == pg)
			break;
	}
	if (tp == NULL)
		panic("uvm_pglistalloc: page not on freelist");
#endif
	LIST_REMOVE(pg, pageq.list);	/* global */
	LIST_REMOVE(pg, listq.list);	/* cpu */
	uvmexp.free--;
	if (pg->flags & PG_ZERO)
		uvmexp.zeropages--;
	VM_FREE_PAGE_TO_CPU(pg)->pages[pgflidx]--;
	pg->flags = PG_CLEAN;
	pg->pqflags = 0;
	pg->uobject = NULL;
	pg->uanon = NULL;
	TAILQ_INSERT_TAIL(rlist, pg, pageq.queue);
	STAT_INCR(uvm_pglistalloc_npages);
}

static int
uvm_pglistalloc_c_ps(struct vm_physseg *ps, int num, paddr_t low, paddr_t high,
    paddr_t alignment, paddr_t boundary, struct pglist *rlist)
{
	signed int try, limit, tryidx, end, idx, skip;
	struct vm_page *pgs;
	int pagemask;
	bool second_pass;
#ifdef DEBUG
	paddr_t idxpa, lastidxpa;
	int cidx = 0;	/* XXX: GCC */
#endif
#ifdef PGALLOC_VERBOSE
	printf("pgalloc: contig %d pgs from psi %zd\n", num, ps - vm_physmem);
#endif

	KASSERT(mutex_owned(&uvm_fpageqlock));

	low = atop(low);
	high = atop(high);
	alignment = atop(alignment);

	/*
	 * Make sure that physseg falls within the range to be allocated from.
	 */
	if (high <= ps->avail_start || low >= ps->avail_end)
		return 0;

	/*
	 * We start our search just after where the last allocation
	 * succeeded.
	 */
	try = roundup2(max(low, ps->avail_start + ps->start_hint), alignment);
	limit = min(high, ps->avail_end);
	pagemask = ~((boundary >> PAGE_SHIFT) - 1);
	skip = 0;
	second_pass = false;
	pgs = ps->pgs;

	for (;;) {
		bool ok = true;
		signed int cnt;

		if (try + num > limit) {
			if (ps->start_hint == 0 || second_pass) {
				/*
				 * We've run past the allowable range.
				 */
				return 0; /* FAIL = 0 pages */
			}
			/*
			 * We've wrapped around the end of this segment,
			 * so restart at the beginning, but now our limit
			 * is where we started.
			 */
			second_pass = true;
			try = roundup2(max(low, ps->avail_start), alignment);
			limit = min(limit, ps->avail_start + ps->start_hint);
			skip = 0;
			continue;
		}
		if (boundary != 0 &&
		    ((try ^ (try + num - 1)) & pagemask) != 0) {
			/*
			 * Region crosses boundary.  Jump to the boundary
			 * just crossed and ensure alignment.
			 */
			try = (try + num - 1) & pagemask;
			try = roundup2(try, alignment);
			skip = 0;
			continue;
		}
#ifdef DEBUG
		/*
		 * Make sure this is a managed physical page.
		 */

		if (vm_physseg_find(try, &cidx) != ps - vm_physmem)
			panic("pgalloc contig: botch1");
		if (cidx != try - ps->start)
			panic("pgalloc contig: botch2");
		if (vm_physseg_find(try + num - 1, &cidx) != ps - vm_physmem)
			panic("pgalloc contig: botch3");
		if (cidx != try - ps->start + num - 1)
			panic("pgalloc contig: botch4");
#endif
		tryidx = try - ps->start;
		end = tryidx + num;

		/*
		 * Found a suitable starting page.  See if the range is free.
		 */
#ifdef PGALLOC_VERBOSE
		printf("%s: ps=%p try=%#x end=%#x skip=%#x, align=%#"PRIxPADDR,
		    __func__, ps, tryidx, end, skip, alignment);
#endif
		/*
		 * We start at the end and work backwards since if we find a
		 * non-free page, it makes no sense to continue.
		 *
		 * But on the plus side, we have "vetted" some number of free
		 * pages.  If this iteration fails, we may be able to skip
		 * testing most of those pages again in the next pass.
		 */
		for (idx = end - 1; idx >= tryidx + skip; idx--) {
			if (VM_PAGE_IS_FREE(&pgs[idx]) == 0) {
				ok = false;
				break;
			}

#ifdef DEBUG
			if (idx > tryidx) {
				idxpa = VM_PAGE_TO_PHYS(&pgs[idx]);
				lastidxpa = VM_PAGE_TO_PHYS(&pgs[idx - 1]);
				if ((lastidxpa + PAGE_SIZE) != idxpa) {
					/*
					 * Region not contiguous.
					 */
					panic("pgalloc contig: botch5");
				}
				if (boundary != 0 &&
				    ((lastidxpa ^ idxpa) & ~(boundary - 1))
				    != 0) {
					/*
					 * Region crosses boundary.
					 */
					panic("pgalloc contig: botch6");
				}
			}
#endif
		}

		if (ok) {
			while (skip-- > 0) {
				KDASSERT(VM_PAGE_IS_FREE(&pgs[tryidx + skip]));
			}
#ifdef PGALLOC_VERBOSE
			printf(": ok\n");
#endif
			break;
		}

#ifdef PGALLOC_VERBOSE
		printf(": non-free at %#x\n", idx - tryidx);
#endif
		/*
		 * Count the number of pages we can advance: every candidate
		 * start up to and including idx would contain the non-free
		 * page we just found at idx.
		 */
		cnt = idx + 1 - tryidx;
		/*
		 * Now round that up to the needed alignment.
		 */
		cnt = roundup2(cnt, alignment);
		/*
		 * The number of pages we can skip checking
		 * (might be 0 if cnt > num).
		 */
		skip = max(num - cnt, 0);
		try += cnt;
	}

	/*
	 * We have a chunk of memory that conforms to the requested
	 * constraints.
	 */
	for (idx = tryidx, pgs += idx; idx < end; idx++, pgs++)
		uvm_pglist_add(pgs, rlist);

	/*
	 * The next time we need to search this segment, start after this
	 * chunk of pages we just allocated.
	 */
	ps->start_hint = try + num - ps->avail_start;
	KASSERTMSG(ps->start_hint <= ps->avail_end - ps->avail_start,
	    ("%x %u (%#x) <= %#"PRIxPADDR" - %#"PRIxPADDR" (%#"PRIxPADDR")",
	     try + num,
	     ps->start_hint, ps->start_hint, ps->avail_end, ps->avail_start,
	     ps->avail_end - ps->avail_start));

#ifdef PGALLOC_VERBOSE
	printf("got %d pgs\n", num);
#endif
	return num; /* number of pages allocated */
}
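
/*
 * Worked example of the boundary arithmetic in uvm_pglistalloc_c_ps()
 * (editorial addition; the numbers are hypothetical).  With
 * PAGE_SHIFT == 12 and boundary == 0x1000000 (16MB),
 *
 *	pagemask = ~((0x1000000 >> 12) - 1) = ~0xfff,
 *
 * i.e. the page-frame bits that select a 16MB window.  A run of num
 * pages starting at frame "try" stays inside one window exactly when
 * the first and last frames agree in those bits:
 *
 *	try = 0xffe, num = 4:  (0xffe ^ 0x1001) & ~0xfff == 0x1000,
 *	    so the run crosses; the search jumps to
 *	    (try + num - 1) & pagemask == 0x1000 and re-rounds it to
 *	    the requested alignment.
 *	try = 0x1000, num = 4: (0x1000 ^ 0x1003) & ~0xfff == 0, so
 *	    the run fits.
 */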

static int
uvm_pglistalloc_contig(int num, paddr_t low, paddr_t high, paddr_t alignment,
    paddr_t boundary, struct pglist *rlist)
{
	int fl, psi;
	struct vm_physseg *ps;
	int error;

	/* Default to "lose". */
	error = ENOMEM;

	/*
	 * Block all memory allocation and lock the free list.
	 */
	mutex_spin_enter(&uvm_fpageqlock);

	/* Are there even any free pages? */
	if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
		goto out;

	for (fl = 0; fl < VM_NFREELIST; fl++) {
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
		for (psi = vm_nphysseg - 1 ; psi >= 0 ; psi--)
#else
		for (psi = 0 ; psi < vm_nphysseg ; psi++)
#endif
		{
			ps = &vm_physmem[psi];

			if (ps->free_list != fl)
				continue;

			num -= uvm_pglistalloc_c_ps(ps, num, low, high,
			    alignment, boundary, rlist);
			if (num == 0) {
#ifdef PGALLOC_VERBOSE
				printf("pgalloc: %"PRIxMAX"-%"PRIxMAX"\n",
				    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_FIRST(rlist)),
				    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_LAST(rlist, pglist)));
#endif
				error = 0;
				goto out;
			}
		}
	}

out:
	/*
	 * Check to see if we need to generate some free pages by waking
	 * the pagedaemon.
	 */

	uvm_kick_pdaemon();
	mutex_spin_exit(&uvm_fpageqlock);
	return (error);
}
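
/*
 * Editorial note on ps->start_hint (not part of the original source;
 * the numbers are hypothetical).  Both per-segment searchers,
 * uvm_pglistalloc_c_ps() above and uvm_pglistalloc_s_ps() below, use
 * the hint the same way: it records, as an offset from avail_start,
 * where the previous allocation ended, so a search first scans
 * [avail_start + hint, limit) and only then wraps around to scan
 * [avail_start, avail_start + hint).  For a segment with
 * avail_start == 100, avail_end == 200 and start_hint == 30, the
 * first pass covers page frames 130..199 and the second pass
 * 100..129; only if both fail does the segment yield nothing.
 */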

static int
uvm_pglistalloc_s_ps(struct vm_physseg *ps, int num, paddr_t low, paddr_t high,
    struct pglist *rlist)
{
	int todo, limit, try;
	struct vm_page *pg;
	bool second_pass;
#ifdef PGALLOC_VERBOSE
	printf("pgalloc: simple %d pgs from psi %zd\n", num, ps - vm_physmem);
#endif

	KASSERT(mutex_owned(&uvm_fpageqlock));
	KASSERT(ps->start <= ps->avail_start);
	KASSERT(ps->start <= ps->avail_end);
	KASSERT(ps->avail_start <= ps->end);
	KASSERT(ps->avail_end <= ps->end);

	low = atop(low);
	high = atop(high);
	todo = num;
	try = max(low, ps->avail_start + ps->start_hint);
	limit = min(high, ps->avail_end);
	pg = &ps->pgs[try - ps->start];
	second_pass = false;

	/*
	 * Make sure that physseg falls within the range to be allocated from.
	 */
	if (high <= ps->avail_start || low >= ps->avail_end)
		return 0;

again:
	for (;; try++, pg++) {
		if (try >= limit) {
			if (ps->start_hint == 0 || second_pass) {
				try = limit - 1;
				break;
			}
			second_pass = true;
			try = max(low, ps->avail_start);
			limit = min(limit, ps->avail_start + ps->start_hint);
			pg = &ps->pgs[try - ps->start];
			goto again;
		}
#if defined(DEBUG)
		{
			int cidx = 0;
			const int bank = vm_physseg_find(try, &cidx);
			KDASSERTMSG(bank == ps - vm_physmem,
			    ("vm_physseg_find(%#x) (%d) != ps %zd",
			     try, bank, ps - vm_physmem));
			KDASSERTMSG(cidx == try - ps->start,
			    ("vm_physseg_find(%#x): %#x != off %"PRIxPADDR,
			     try, cidx, try - ps->start));
		}
#endif
		if (VM_PAGE_IS_FREE(pg) == 0)
			continue;

		uvm_pglist_add(pg, rlist);
		if (--todo == 0) {
			break;
		}
	}

	/*
	 * The next time we need to search this segment,
	 * start just after the pages we just allocated.
	 */
	ps->start_hint = try + 1 - ps->avail_start;
	KASSERTMSG(ps->start_hint <= ps->avail_end - ps->avail_start,
	    ("%#x %u (%#x) <= %#"PRIxPADDR" - %#"PRIxPADDR" (%#"PRIxPADDR")",
	     try + 1,
	     ps->start_hint, ps->start_hint, ps->avail_end, ps->avail_start,
	     ps->avail_end - ps->avail_start));

#ifdef PGALLOC_VERBOSE
	printf("got %d pgs\n", num - todo);
#endif
	return (num - todo); /* number of pages allocated */
}

static int
uvm_pglistalloc_simple(int num, paddr_t low, paddr_t high,
    struct pglist *rlist, int waitok)
{
	int fl, psi, error;
	struct vm_physseg *ps;

	/* Default to "lose". */
	error = ENOMEM;

again:
	/*
	 * Block all memory allocation and lock the free list.
	 */
	mutex_spin_enter(&uvm_fpageqlock);

	/* Are there even any free pages? */
	if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
		goto out;

	for (fl = 0; fl < VM_NFREELIST; fl++) {
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
		for (psi = vm_nphysseg - 1 ; psi >= 0 ; psi--)
#else
		for (psi = 0 ; psi < vm_nphysseg ; psi++)
#endif
		{
			ps = &vm_physmem[psi];

			if (ps->free_list != fl)
				continue;

			num -= uvm_pglistalloc_s_ps(ps, num, low, high, rlist);
			if (num == 0) {
				error = 0;
				goto out;
			}
		}
	}

out:
	/*
	 * Check to see if we need to generate some free pages by waking
	 * the pagedaemon.
	 */

	uvm_kick_pdaemon();
	mutex_spin_exit(&uvm_fpageqlock);

	if (error) {
		if (waitok) {
			/* XXX perhaps some time limitation? */
#ifdef DEBUG
			printf("pglistalloc waiting\n");
#endif
			uvm_wait("pglalloc");
			goto again;
		} else
			uvm_pglistfree(rlist);
	}
#ifdef PGALLOC_VERBOSE
	if (!error)
		printf("pgalloc: %"PRIxMAX"..%"PRIxMAX"\n",
		    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_FIRST(rlist)),
		    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_LAST(rlist, pglist)));
#endif
	return (error);
}
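
/*
 * Editorial note (not part of the original source): uvm_pglistalloc()
 * below dispatches between the two strategies.  The simple path is
 * used only when the caller accepts as many segments as there are
 * pages (nsegs >= size >> PAGE_SHIFT), needs no alignment stricter
 * than PAGE_SIZE, and imposes no boundary; anything stricter goes to
 * the contiguous allocator.  For example (hypothetical numbers), a
 * 4-page request with nsegs == 4, alignment == PAGE_SIZE and
 * boundary == 0 takes uvm_pglistalloc_simple(); the same request with
 * nsegs == 1 takes uvm_pglistalloc_contig().  Note also that waitok
 * is honored only on the simple path; the contiguous path never
 * sleeps.
 */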

int
uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
    paddr_t boundary, struct pglist *rlist, int nsegs, int waitok)
{
	int num, res;

	KASSERT((alignment & (alignment - 1)) == 0);
	KASSERT((boundary & (boundary - 1)) == 0);

	/*
	 * Our allocations are always page granularity, so our alignment
	 * must be, too.
	 */
	if (alignment < PAGE_SIZE)
		alignment = PAGE_SIZE;
	if (boundary != 0 && boundary < size)
		return (EINVAL);
	num = atop(round_page(size));
	low = roundup2(low, alignment);

	TAILQ_INIT(rlist);

	if ((nsegs < size >> PAGE_SHIFT) || (alignment != PAGE_SIZE) ||
	    (boundary != 0))
		res = uvm_pglistalloc_contig(num, low, high, alignment,
		    boundary, rlist);
	else
		res = uvm_pglistalloc_simple(num, low, high, rlist, waitok);

	return (res);
}

/*
 * uvm_pglistfree: free a list of pages
 *
 * => pages should already be unmapped
 */

void
uvm_pglistfree(struct pglist *list)
{
	struct uvm_cpu *ucpu;
	struct vm_page *pg;
	int index, color, queue;
	bool iszero;

	/*
	 * Lock the free list and free each page.
	 */

	mutex_spin_enter(&uvm_fpageqlock);
	ucpu = curcpu()->ci_data.cpu_uvm;
	while ((pg = TAILQ_FIRST(list)) != NULL) {
		KASSERT(!uvmpdpol_pageisqueued_p(pg));
		TAILQ_REMOVE(list, pg, pageq.queue);
		iszero = (pg->flags & PG_ZERO);
		pg->pqflags = PQ_FREE;
#ifdef DEBUG
		pg->uobject = (void *)0xdeadbeef;
		pg->uanon = (void *)0xdeadbeef;
#endif /* DEBUG */
#ifdef DEBUG
		if (iszero)
			uvm_pagezerocheck(pg);
#endif /* DEBUG */
		index = uvm_page_lookup_freelist(pg);
		color = VM_PGCOLOR_BUCKET(pg);
		queue = iszero ? PGFL_ZEROS : PGFL_UNKNOWN;
		pg->offset = (uintptr_t)ucpu;
		LIST_INSERT_HEAD(&uvm.page_free[index].pgfl_buckets[color].
		    pgfl_queues[queue], pg, pageq.list);
		LIST_INSERT_HEAD(&ucpu->page_free[index].pgfl_buckets[color].
		    pgfl_queues[queue], pg, listq.list);
		uvmexp.free++;
		if (iszero)
			uvmexp.zeropages++;
		ucpu->pages[queue]++;
		STAT_DECR(uvm_pglistalloc_npages);
	}
	if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN])
		ucpu->page_idle_zero = vm_page_zero_enable;
	mutex_spin_exit(&uvm_fpageqlock);
}
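
/*
 * Lifecycle sketch (editorial addition, not part of the original
 * source; "mlist", "pg", "pa" and "error" are hypothetical locals).
 * A typical caller pairs the two entry points and walks the returned
 * list with VM_PAGE_TO_PHYS():
 *
 *	struct pglist mlist;
 *	struct vm_page *pg;
 *	paddr_t pa;
 *	int error;
 *
 *	error = uvm_pglistalloc(4 * PAGE_SIZE, 0, ~(paddr_t)0,
 *	    PAGE_SIZE, 0, &mlist, 4, 1);
 *	if (error == 0) {
 *		TAILQ_FOREACH(pg, &mlist, pageq.queue) {
 *			pa = VM_PAGE_TO_PHYS(pg);
 *			(use pa, e.g. for DMA; unmap before freeing)
 *		}
 *		uvm_pglistfree(&mlist);
 *	}
 */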