1 /* $NetBSD: uvm_page.c,v 1.234 2020/03/17 18:31:39 ad Exp $ */ 2 3 /*- 4 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 1997 Charles D. Cranor and Washington University. 34 * Copyright (c) 1991, 1993, The Regents of the University of California. 35 * 36 * All rights reserved. 37 * 38 * This code is derived from software contributed to Berkeley by 39 * The Mach Operating System project at Carnegie-Mellon University. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the University nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 
64 * 65 * @(#)vm_page.c 8.3 (Berkeley) 3/21/94 66 * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp 67 * 68 * 69 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 70 * All rights reserved. 71 * 72 * Permission to use, copy, modify and distribute this software and 73 * its documentation is hereby granted, provided that both the copyright 74 * notice and this permission notice appear in all copies of the 75 * software, derivative works or modified versions, and any portions 76 * thereof, and that both notices appear in supporting documentation. 77 * 78 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 79 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 80 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 81 * 82 * Carnegie Mellon requests users of this software to return to 83 * 84 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 85 * School of Computer Science 86 * Carnegie Mellon University 87 * Pittsburgh PA 15213-3890 88 * 89 * any improvements or extensions that they make and grant Carnegie the 90 * rights to redistribute these changes. 91 */ 92 93 /* 94 * uvm_page.c: page ops. 95 */ 96 97 #include <sys/cdefs.h> 98 __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.234 2020/03/17 18:31:39 ad Exp $"); 99 100 #include "opt_ddb.h" 101 #include "opt_uvm.h" 102 #include "opt_uvmhist.h" 103 #include "opt_readahead.h" 104 105 #include <sys/param.h> 106 #include <sys/systm.h> 107 #include <sys/sched.h> 108 #include <sys/kernel.h> 109 #include <sys/vnode.h> 110 #include <sys/proc.h> 111 #include <sys/radixtree.h> 112 #include <sys/atomic.h> 113 #include <sys/cpu.h> 114 #include <sys/extent.h> 115 116 #include <uvm/uvm.h> 117 #include <uvm/uvm_ddb.h> 118 #include <uvm/uvm_pdpolicy.h> 119 #include <uvm/uvm_pgflcache.h> 120 121 /* 122 * Some supported CPUs in a given architecture don't support all 123 * of the things necessary to do idle page zero'ing efficiently. 124 * We therefore provide a way to enable it from machdep code here. 125 */ 126 bool vm_page_zero_enable = false; 127 128 /* 129 * number of pages per-CPU to reserve for the kernel. 130 */ 131 #ifndef UVM_RESERVED_PAGES_PER_CPU 132 #define UVM_RESERVED_PAGES_PER_CPU 5 133 #endif 134 int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU; 135 136 /* 137 * physical memory size; 138 */ 139 psize_t physmem; 140 141 /* 142 * local variables 143 */ 144 145 /* 146 * these variables record the values returned by vm_page_bootstrap, 147 * for debugging purposes. The implementation of uvm_pageboot_alloc 148 * and pmap_startup here also uses them internally. 149 */ 150 151 static vaddr_t virtual_space_start; 152 static vaddr_t virtual_space_end; 153 154 /* 155 * we allocate an initial number of page colors in uvm_page_init(), 156 * and remember them. We may re-color pages as cache sizes are 157 * discovered during the autoconfiguration phase. But we can never 158 * free the initial set of buckets, since they are allocated using 159 * uvm_pageboot_alloc(). 160 */ 161 162 static size_t recolored_pages_memsize /* = 0 */; 163 static char *recolored_pages_mem; 164 165 /* 166 * freelist locks - one per bucket. 167 */ 168 169 union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS] 170 __cacheline_aligned; 171 172 /* 173 * basic NUMA information. 
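 * Each region record below maps a range of physical addresses to a NUMA
 * node id; records are chained into a list by uvm_page_numa_load() and
 * consulted by uvm_page_numa_lookup() when pages are assigned to buckets.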
174 */ 175 176 static struct uvm_page_numa_region { 177 struct uvm_page_numa_region *next; 178 paddr_t start; 179 paddr_t size; 180 u_int numa_id; 181 } *uvm_page_numa_region; 182 183 #ifdef DEBUG 184 kmutex_t uvm_zerochecklock __cacheline_aligned; 185 vaddr_t uvm_zerocheckkva; 186 #endif /* DEBUG */ 187 188 /* 189 * These functions are reserved for uvm(9) internal use and are not 190 * exported in the header file uvm_physseg.h 191 * 192 * Thus they are redefined here. 193 */ 194 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *); 195 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t); 196 197 /* returns a pgs array */ 198 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t); 199 200 /* 201 * inline functions 202 */ 203 204 /* 205 * uvm_pageinsert: insert a page in the object. 206 * 207 * => caller must lock object 208 * => call should have already set pg's object and offset pointers 209 * and bumped the version counter 210 */ 211 212 static inline void 213 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg) 214 { 215 216 KASSERT(uobj == pg->uobject); 217 KASSERT(rw_write_held(uobj->vmobjlock)); 218 KASSERT((pg->flags & PG_TABLED) == 0); 219 220 if ((pg->flags & PG_STAT) != 0) { 221 /* Cannot use uvm_pagegetdirty(): not yet in radix tree. */ 222 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY); 223 const bool isaobj = (pg->flags & PG_AOBJ) != 0; 224 225 if (!isaobj) { 226 KASSERT((pg->flags & PG_FILE) != 0); 227 if (uobj->uo_npages == 0) { 228 struct vnode *vp = (struct vnode *)uobj; 229 mutex_enter(vp->v_interlock); 230 KASSERT((vp->v_iflag & VI_PAGES) == 0); 231 vp->v_iflag |= VI_PAGES; 232 vholdl(vp); 233 mutex_exit(vp->v_interlock); 234 } 235 kpreempt_disable(); 236 if (UVM_OBJ_IS_VTEXT(uobj)) { 237 CPU_COUNT(CPU_COUNT_EXECPAGES, 1); 238 } else { 239 CPU_COUNT(CPU_COUNT_FILEPAGES, 1); 240 } 241 CPU_COUNT(CPU_COUNT_FILEUNKNOWN + status, 1); 242 } else { 243 kpreempt_disable(); 244 CPU_COUNT(CPU_COUNT_ANONPAGES, 1); 245 CPU_COUNT(CPU_COUNT_ANONUNKNOWN + status, 1); 246 } 247 kpreempt_enable(); 248 } 249 pg->flags |= PG_TABLED; 250 uobj->uo_npages++; 251 } 252 253 static inline int 254 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg) 255 { 256 const uint64_t idx = pg->offset >> PAGE_SHIFT; 257 int error; 258 259 error = radix_tree_insert_node(&uobj->uo_pages, idx, pg); 260 if (error != 0) { 261 return error; 262 } 263 if ((pg->flags & PG_CLEAN) == 0) { 264 radix_tree_set_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG); 265 } 266 KASSERT(((pg->flags & PG_CLEAN) == 0) == 267 radix_tree_get_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG)); 268 return 0; 269 } 270 271 /* 272 * uvm_page_remove: remove page from object. 273 * 274 * => caller must lock object 275 */ 276 277 static inline void 278 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg) 279 { 280 281 KASSERT(uobj == pg->uobject); 282 KASSERT(rw_write_held(uobj->vmobjlock)); 283 KASSERT(pg->flags & PG_TABLED); 284 285 if ((pg->flags & PG_STAT) != 0) { 286 /* Cannot use uvm_pagegetdirty(): no longer in radix tree. 
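 * The PG_CLEAN/PG_DIRTY bits are read from pg->flags directly instead.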
*/ 287 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY); 288 const bool isaobj = (pg->flags & PG_AOBJ) != 0; 289 290 if (!isaobj) { 291 KASSERT((pg->flags & PG_FILE) != 0); 292 if (uobj->uo_npages == 1) { 293 struct vnode *vp = (struct vnode *)uobj; 294 mutex_enter(vp->v_interlock); 295 KASSERT((vp->v_iflag & VI_PAGES) != 0); 296 vp->v_iflag &= ~VI_PAGES; 297 holdrelel(vp); 298 mutex_exit(vp->v_interlock); 299 } 300 kpreempt_disable(); 301 if (UVM_OBJ_IS_VTEXT(uobj)) { 302 CPU_COUNT(CPU_COUNT_EXECPAGES, -1); 303 } else { 304 CPU_COUNT(CPU_COUNT_FILEPAGES, -1); 305 } 306 CPU_COUNT(CPU_COUNT_FILEUNKNOWN + status, -1); 307 } else { 308 kpreempt_disable(); 309 CPU_COUNT(CPU_COUNT_ANONPAGES, -1); 310 CPU_COUNT(CPU_COUNT_ANONUNKNOWN + status, -1); 311 } 312 kpreempt_enable(); 313 } 314 uobj->uo_npages--; 315 pg->flags &= ~PG_TABLED; 316 pg->uobject = NULL; 317 } 318 319 static inline void 320 uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg) 321 { 322 struct vm_page *opg __unused; 323 324 opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT); 325 KASSERT(pg == opg); 326 } 327 328 static void 329 uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num) 330 { 331 int i; 332 333 pgb->pgb_nfree = 0; 334 for (i = 0; i < uvmexp.ncolors; i++) { 335 LIST_INIT(&pgb->pgb_colors[i]); 336 } 337 pgfl->pgfl_buckets[num] = pgb; 338 } 339 340 /* 341 * uvm_page_init: init the page system. called from uvm_init(). 342 * 343 * => we return the range of kernel virtual memory in kvm_startp/kvm_endp 344 */ 345 346 void 347 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp) 348 { 349 static struct uvm_cpu boot_cpu __cacheline_aligned; 350 psize_t freepages, pagecount, bucketsize, n; 351 struct pgflbucket *pgb; 352 struct vm_page *pagearray; 353 char *bucketarray; 354 uvm_physseg_t bank; 355 int fl, b; 356 357 KASSERT(ncpu <= 1); 358 359 /* 360 * init the page queues and free page queue locks, except the 361 * free list; we allocate that later (with the initial vm_page 362 * structures). 363 */ 364 365 curcpu()->ci_data.cpu_uvm = &boot_cpu; 366 uvmpdpol_init(); 367 for (b = 0; b < __arraycount(uvm_freelist_locks); b++) { 368 mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM); 369 } 370 371 /* 372 * allocate vm_page structures. 373 */ 374 375 /* 376 * sanity check: 377 * before calling this function the MD code is expected to register 378 * some free RAM with the uvm_page_physload() function. our job 379 * now is to allocate vm_page structures for this memory. 380 */ 381 382 if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID) 383 panic("uvm_page_bootstrap: no memory pre-allocated"); 384 385 /* 386 * first calculate the number of free pages... 387 * 388 * note that we use start/end rather than avail_start/avail_end. 389 * this allows us to allocate extra vm_page structures in case we 390 * want to return some memory to the pool after booting. 391 */ 392 393 freepages = 0; 394 395 for (bank = uvm_physseg_get_first(); 396 uvm_physseg_valid_p(bank) ; 397 bank = uvm_physseg_get_next(bank)) { 398 freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank)); 399 } 400 401 /* 402 * Let MD code initialize the number of colors, or default 403 * to 1 color if MD code doesn't care. 404 */ 405 if (uvmexp.ncolors == 0) 406 uvmexp.ncolors = 1; 407 uvmexp.colormask = uvmexp.ncolors - 1; 408 KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0); 409 410 /* We always start with only 1 bucket. 
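 * uvm_page_rebucket() may redistribute pages into more buckets later,
 * once the CPU and NUMA topology is known.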
*/ 411 uvm.bucketcount = 1; 412 413 /* 414 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can 415 * use. for each page of memory we use we need a vm_page structure. 416 * thus, the total number of pages we can use is the total size of 417 * the memory divided by the PAGE_SIZE plus the size of the vm_page 418 * structure. we add one to freepages as a fudge factor to avoid 419 * truncation errors (since we can only allocate in terms of whole 420 * pages). 421 */ 422 pagecount = ((freepages + 1) << PAGE_SHIFT) / 423 (PAGE_SIZE + sizeof(struct vm_page)); 424 bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]); 425 bucketsize = roundup2(bucketsize, coherency_unit); 426 bucketarray = (void *)uvm_pageboot_alloc( 427 bucketsize * VM_NFREELIST + 428 pagecount * sizeof(struct vm_page)); 429 pagearray = (struct vm_page *) 430 (bucketarray + bucketsize * VM_NFREELIST); 431 432 for (fl = 0; fl < VM_NFREELIST; fl++) { 433 pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl); 434 uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0); 435 } 436 memset(pagearray, 0, pagecount * sizeof(struct vm_page)); 437 438 /* 439 * init the freelist cache in the disabled state. 440 */ 441 uvm_pgflcache_init(); 442 443 /* 444 * init the vm_page structures and put them in the correct place. 445 */ 446 /* First init the extent */ 447 448 for (bank = uvm_physseg_get_first(), 449 uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount); 450 uvm_physseg_valid_p(bank); 451 bank = uvm_physseg_get_next(bank)) { 452 453 n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank); 454 uvm_physseg_seg_alloc_from_slab(bank, n); 455 uvm_physseg_init_seg(bank, pagearray); 456 457 /* set up page array pointers */ 458 pagearray += n; 459 pagecount -= n; 460 } 461 462 /* 463 * pass up the values of virtual_space_start and 464 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper 465 * layers of the VM. 466 */ 467 468 *kvm_startp = round_page(virtual_space_start); 469 *kvm_endp = trunc_page(virtual_space_end); 470 #ifdef DEBUG 471 /* 472 * steal kva for uvm_pagezerocheck(). 473 */ 474 uvm_zerocheckkva = *kvm_startp; 475 *kvm_startp += PAGE_SIZE; 476 mutex_init(&uvm_zerochecklock, MUTEX_DEFAULT, IPL_VM); 477 #endif /* DEBUG */ 478 479 /* 480 * init various thresholds. 481 */ 482 483 uvmexp.reserve_pagedaemon = 1; 484 uvmexp.reserve_kernel = vm_page_reserve_kernel; 485 486 /* 487 * done! 488 */ 489 490 uvm.page_init_done = true; 491 } 492 493 /* 494 * uvm_pgfl_lock: lock all freelist buckets 495 */ 496 497 void 498 uvm_pgfl_lock(void) 499 { 500 int i; 501 502 for (i = 0; i < __arraycount(uvm_freelist_locks); i++) { 503 mutex_spin_enter(&uvm_freelist_locks[i].lock); 504 } 505 } 506 507 /* 508 * uvm_pgfl_unlock: unlock all freelist buckets 509 */ 510 511 void 512 uvm_pgfl_unlock(void) 513 { 514 int i; 515 516 for (i = 0; i < __arraycount(uvm_freelist_locks); i++) { 517 mutex_spin_exit(&uvm_freelist_locks[i].lock); 518 } 519 } 520 521 /* 522 * uvm_setpagesize: set the page size 523 * 524 * => sets page_shift and page_mask from uvmexp.pagesize. 525 */ 526 527 void 528 uvm_setpagesize(void) 529 { 530 531 /* 532 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE 533 * to be a constant (indicated by being a non-zero value). 
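 * When PAGE_SIZE is not a compile-time constant, the machine-dependent
 * code must set uvmexp.pagesize before calling us, or we panic below.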
534 */ 535 if (uvmexp.pagesize == 0) { 536 if (PAGE_SIZE == 0) 537 panic("uvm_setpagesize: uvmexp.pagesize not set"); 538 uvmexp.pagesize = PAGE_SIZE; 539 } 540 uvmexp.pagemask = uvmexp.pagesize - 1; 541 if ((uvmexp.pagemask & uvmexp.pagesize) != 0) 542 panic("uvm_setpagesize: page size %u (%#x) not a power of two", 543 uvmexp.pagesize, uvmexp.pagesize); 544 for (uvmexp.pageshift = 0; ; uvmexp.pageshift++) 545 if ((1 << uvmexp.pageshift) == uvmexp.pagesize) 546 break; 547 } 548 549 /* 550 * uvm_pageboot_alloc: steal memory from physmem for bootstrapping 551 */ 552 553 vaddr_t 554 uvm_pageboot_alloc(vsize_t size) 555 { 556 static bool initialized = false; 557 vaddr_t addr; 558 #if !defined(PMAP_STEAL_MEMORY) 559 vaddr_t vaddr; 560 paddr_t paddr; 561 #endif 562 563 /* 564 * on first call to this function, initialize ourselves. 565 */ 566 if (initialized == false) { 567 pmap_virtual_space(&virtual_space_start, &virtual_space_end); 568 569 /* round it the way we like it */ 570 virtual_space_start = round_page(virtual_space_start); 571 virtual_space_end = trunc_page(virtual_space_end); 572 573 initialized = true; 574 } 575 576 /* round to page size */ 577 size = round_page(size); 578 uvmexp.bootpages += atop(size); 579 580 #if defined(PMAP_STEAL_MEMORY) 581 582 /* 583 * defer bootstrap allocation to MD code (it may want to allocate 584 * from a direct-mapped segment). pmap_steal_memory should adjust 585 * virtual_space_start/virtual_space_end if necessary. 586 */ 587 588 addr = pmap_steal_memory(size, &virtual_space_start, 589 &virtual_space_end); 590 591 return(addr); 592 593 #else /* !PMAP_STEAL_MEMORY */ 594 595 /* 596 * allocate virtual memory for this request 597 */ 598 if (virtual_space_start == virtual_space_end || 599 (virtual_space_end - virtual_space_start) < size) 600 panic("uvm_pageboot_alloc: out of virtual space"); 601 602 addr = virtual_space_start; 603 604 #ifdef PMAP_GROWKERNEL 605 /* 606 * If the kernel pmap can't map the requested space, 607 * then allocate more resources for it. 608 */ 609 if (uvm_maxkaddr < (addr + size)) { 610 uvm_maxkaddr = pmap_growkernel(addr + size); 611 if (uvm_maxkaddr < (addr + size)) 612 panic("uvm_pageboot_alloc: pmap_growkernel() failed"); 613 } 614 #endif 615 616 virtual_space_start += size; 617 618 /* 619 * allocate and mapin physical pages to back new virtual pages 620 */ 621 622 for (vaddr = round_page(addr) ; vaddr < addr + size ; 623 vaddr += PAGE_SIZE) { 624 625 if (!uvm_page_physget(&paddr)) 626 panic("uvm_pageboot_alloc: out of memory"); 627 628 /* 629 * Note this memory is no longer managed, so using 630 * pmap_kenter is safe. 631 */ 632 pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0); 633 } 634 pmap_update(pmap_kernel()); 635 return(addr); 636 #endif /* PMAP_STEAL_MEMORY */ 637 } 638 639 #if !defined(PMAP_STEAL_MEMORY) 640 /* 641 * uvm_page_physget: "steal" one page from the vm_physmem structure. 642 * 643 * => attempt to allocate it off the end of a segment in which the "avail" 644 * values match the start/end values. if we can't do that, then we 645 * will advance both values (making them equal, and removing some 646 * vm_page structures from the non-avail area). 647 * => return false if out of memory. 
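 * => may only be used before uvm_page_init() completes; it panics if
 *    called after bootstrap.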
648 */ 649 650 /* subroutine: try to allocate from memory chunks on the specified freelist */ 651 static bool uvm_page_physget_freelist(paddr_t *, int); 652 653 static bool 654 uvm_page_physget_freelist(paddr_t *paddrp, int freelist) 655 { 656 uvm_physseg_t lcv; 657 658 /* pass 1: try allocating from a matching end */ 659 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) 660 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv)) 661 #else 662 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv)) 663 #endif 664 { 665 if (uvm.page_init_done == true) 666 panic("uvm_page_physget: called _after_ bootstrap"); 667 668 /* Try to match at front or back on unused segment */ 669 if (uvm_page_physunload(lcv, freelist, paddrp)) 670 return true; 671 } 672 673 /* pass2: forget about matching ends, just allocate something */ 674 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) 675 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv)) 676 #else 677 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv)) 678 #endif 679 { 680 /* Try the front regardless. */ 681 if (uvm_page_physunload_force(lcv, freelist, paddrp)) 682 return true; 683 } 684 return false; 685 } 686 687 bool 688 uvm_page_physget(paddr_t *paddrp) 689 { 690 int i; 691 692 /* try in the order of freelist preference */ 693 for (i = 0; i < VM_NFREELIST; i++) 694 if (uvm_page_physget_freelist(paddrp, i) == true) 695 return (true); 696 return (false); 697 } 698 #endif /* PMAP_STEAL_MEMORY */ 699 700 /* 701 * PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages 702 * back from an I/O mapping (ugh!). used in some MD code as well. 703 */ 704 struct vm_page * 705 uvm_phys_to_vm_page(paddr_t pa) 706 { 707 paddr_t pf = atop(pa); 708 paddr_t off; 709 uvm_physseg_t upm; 710 711 upm = uvm_physseg_find(pf, &off); 712 if (upm != UVM_PHYSSEG_TYPE_INVALID) 713 return uvm_physseg_get_pg(upm, off); 714 return(NULL); 715 } 716 717 paddr_t 718 uvm_vm_page_to_phys(const struct vm_page *pg) 719 { 720 721 return pg->phys_addr & ~(PAGE_SIZE - 1); 722 } 723 724 /* 725 * uvm_page_numa_load: load NUMA range description. 726 */ 727 void 728 uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id) 729 { 730 struct uvm_page_numa_region *d; 731 732 KASSERT(numa_id < PGFL_MAX_BUCKETS); 733 734 d = kmem_alloc(sizeof(*d), KM_SLEEP); 735 d->start = start; 736 d->size = size; 737 d->numa_id = numa_id; 738 d->next = uvm_page_numa_region; 739 uvm_page_numa_region = d; 740 } 741 742 /* 743 * uvm_page_numa_lookup: lookup NUMA node for the given page. 744 */ 745 static u_int 746 uvm_page_numa_lookup(struct vm_page *pg) 747 { 748 struct uvm_page_numa_region *d; 749 static bool warned; 750 paddr_t pa; 751 752 KASSERT(uvm.numa_alloc); 753 KASSERT(uvm_page_numa_region != NULL); 754 755 pa = VM_PAGE_TO_PHYS(pg); 756 for (d = uvm_page_numa_region; d != NULL; d = d->next) { 757 if (pa >= d->start && pa < d->start + d->size) { 758 return d->numa_id; 759 } 760 } 761 762 if (!warned) { 763 printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#" 764 PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg)); 765 warned = true; 766 } 767 768 return 0; 769 } 770 771 /* 772 * uvm_page_redim: adjust freelist dimensions if they have changed. 
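 * Allocates a new set of buckets for the requested color/bucket counts,
 * moves every free page onto its new bucket and color list, installs the
 * new freelists, and frees the previously allocated bucket memory.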
773 */
774
775 static void
776 uvm_page_redim(int newncolors, int newnbuckets)
777 {
778 struct pgfreelist npgfl;
779 struct pgflbucket *opgb, *npgb;
780 struct pgflist *ohead, *nhead;
781 struct vm_page *pg;
782 size_t bucketsize, bucketmemsize, oldbucketmemsize;
783 int fl, ob, oc, nb, nc, obuckets, ocolors;
784 char *bucketarray, *oldbucketmem, *bucketmem;
785
786 KASSERT(((newncolors - 1) & newncolors) == 0);
787
788 /* Anything to do? */
789 if (newncolors <= uvmexp.ncolors &&
790 newnbuckets == uvm.bucketcount) {
791 return;
792 }
793 if (uvm.page_init_done == false) {
794 uvmexp.ncolors = newncolors;
795 return;
796 }
797
798 bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
799 bucketsize = roundup2(bucketsize, coherency_unit);
800 bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
801 coherency_unit - 1;
802 bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
803 bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
804
805 ocolors = uvmexp.ncolors;
806 obuckets = uvm.bucketcount;
807
808 /* Freelist cache mustn't be enabled. */
809 uvm_pgflcache_pause();
810
811 /* Make sure we should still do this. */
812 uvm_pgfl_lock();
813 if (newncolors <= uvmexp.ncolors &&
814 newnbuckets == uvm.bucketcount) {
815 uvm_pgfl_unlock();
816 uvm_pgflcache_resume();
817 kmem_free(bucketmem, bucketmemsize);
818 return;
819 }
820
821 uvmexp.ncolors = newncolors;
822 uvmexp.colormask = uvmexp.ncolors - 1;
823 uvm.bucketcount = newnbuckets;
824
825 for (fl = 0; fl < VM_NFREELIST; fl++) {
826 /* Init new buckets in new freelist. */
827 memset(&npgfl, 0, sizeof(npgfl));
828 for (nb = 0; nb < newnbuckets; nb++) {
829 npgb = (struct pgflbucket *)bucketarray;
830 uvm_page_init_bucket(&npgfl, npgb, nb);
831 bucketarray += bucketsize;
832 }
833 /* Now transfer pages from the old freelist. */
834 for (nb = ob = 0; ob < obuckets; ob++) {
835 opgb = uvm.page_free[fl].pgfl_buckets[ob];
836 for (oc = 0; oc < ocolors; oc++) {
837 ohead = &opgb->pgb_colors[oc];
838 while ((pg = LIST_FIRST(ohead)) != NULL) {
839 LIST_REMOVE(pg, pageq.list);
840 /*
841 * Here we decide on the NEW color &
842 * bucket for the page. For NUMA
843 * we'll use the info that the
844 * hardware gave us. For non-NUMA
845 * we take the physical page frame
846 * number and cache color into
847 * account. We do this to try to
848 * avoid defeating any memory
849 * interleaving in the hardware.
850 */
851 KASSERT(
852 uvm_page_get_bucket(pg) == ob);
853 KASSERT(fl ==
854 uvm_page_get_freelist(pg));
855 if (uvm.numa_alloc) {
856 nb = uvm_page_numa_lookup(pg);
857 } else {
858 nb = atop(VM_PAGE_TO_PHYS(pg))
859 / uvmexp.ncolors / 8
860 % newnbuckets;
861 }
862 uvm_page_set_bucket(pg, nb);
863 npgb = npgfl.pgfl_buckets[nb];
864 npgb->pgb_nfree++;
865 nc = VM_PGCOLOR(pg);
866 nhead = &npgb->pgb_colors[nc];
867 LIST_INSERT_HEAD(nhead, pg, pageq.list);
868 }
869 }
870 }
871 /* Install the new freelist. */
872 memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
873 }
874
875 /* Unlock and free the old memory. */
876 oldbucketmemsize = recolored_pages_memsize;
877 oldbucketmem = recolored_pages_mem;
878 recolored_pages_memsize = bucketmemsize;
879 recolored_pages_mem = bucketmem;
880
881 uvm_pgfl_unlock();
882 uvm_pgflcache_resume();
883
884 if (oldbucketmemsize) {
885 kmem_free(oldbucketmem, oldbucketmemsize);
886 }
887
888 /*
889 * this calls uvm_km_alloc() which may want to hold
890 * uvm_freelist_lock.
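 * It is therefore called only after the freelist locks have been
 * dropped above.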
891 */ 892 uvm_pager_realloc_emerg(); 893 } 894 895 /* 896 * uvm_page_recolor: Recolor the pages if the new color count is 897 * larger than the old one. 898 */ 899 900 void 901 uvm_page_recolor(int newncolors) 902 { 903 904 uvm_page_redim(newncolors, uvm.bucketcount); 905 } 906 907 /* 908 * uvm_page_rebucket: Determine a bucket structure and redim the free 909 * lists to match. 910 */ 911 912 void 913 uvm_page_rebucket(void) 914 { 915 u_int min_numa, max_numa, npackage, shift; 916 struct cpu_info *ci, *ci2, *ci3; 917 CPU_INFO_ITERATOR cii; 918 919 /* 920 * If we have more than one NUMA node, and the maximum NUMA node ID 921 * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution 922 * for free pages. uvm_pagefree() will not reassign pages to a 923 * different bucket on free. 924 */ 925 min_numa = (u_int)-1; 926 max_numa = 0; 927 for (CPU_INFO_FOREACH(cii, ci)) { 928 if (ci->ci_numa_id < min_numa) { 929 min_numa = ci->ci_numa_id; 930 } 931 if (ci->ci_numa_id > max_numa) { 932 max_numa = ci->ci_numa_id; 933 } 934 } 935 if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) { 936 #ifdef NUMA 937 /* 938 * We can do this, and it seems to work well, but until 939 * further experiments are done we'll stick with the cache 940 * locality strategy. 941 */ 942 aprint_debug("UVM: using NUMA allocation scheme\n"); 943 for (CPU_INFO_FOREACH(cii, ci)) { 944 ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id; 945 } 946 uvm.numa_alloc = true; 947 uvm_page_redim(uvmexp.ncolors, max_numa + 1); 948 return; 949 #endif 950 } 951 952 /* 953 * Otherwise we'll go with a scheme to maximise L2/L3 cache locality 954 * and minimise lock contention. Count the total number of CPU 955 * packages, and then try to distribute the buckets among CPU 956 * packages evenly. uvm_pagefree() will reassign pages to the 957 * freeing CPU's preferred bucket on free. 958 */ 959 npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST]; 960 961 /* 962 * Figure out how to arrange the packages & buckets, and the total 963 * number of buckets we need. XXX 2 may not be the best factor. 964 */ 965 for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) { 966 npackage >>= 1; 967 } 968 uvm_page_redim(uvmexp.ncolors, npackage); 969 970 /* 971 * Now tell each CPU which bucket to use. In the outer loop, scroll 972 * through all CPU packages. 973 */ 974 npackage = 0; 975 ci = curcpu(); 976 ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST]; 977 do { 978 /* 979 * In the inner loop, scroll through all CPUs in the package 980 * and assign the same bucket ID. 981 */ 982 ci3 = ci2; 983 do { 984 ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift; 985 ci3 = ci3->ci_sibling[CPUREL_PACKAGE]; 986 } while (ci3 != ci2); 987 npackage++; 988 ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST]; 989 } while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]); 990 991 aprint_debug("UVM: using package allocation scheme, " 992 "%d package(s) per bucket\n", 1 << shift); 993 } 994 995 /* 996 * uvm_cpu_attach: initialize per-CPU data structures. 997 */ 998 999 void 1000 uvm_cpu_attach(struct cpu_info *ci) 1001 { 1002 struct uvm_cpu *ucpu; 1003 1004 /* Already done in uvm_page_init(). */ 1005 if (!CPU_IS_PRIMARY(ci)) { 1006 /* Add more reserve pages for this CPU. */ 1007 uvmexp.reserve_kernel += vm_page_reserve_kernel; 1008 1009 /* Allocate per-CPU data structures. 
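 * The allocation is padded by coherency_unit - 1 bytes so the structure
 * can be rounded up to a cache line boundary below.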
*/ 1010 ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1, 1011 KM_SLEEP); 1012 ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu, 1013 coherency_unit); 1014 ci->ci_data.cpu_uvm = ucpu; 1015 } else { 1016 ucpu = ci->ci_data.cpu_uvm; 1017 } 1018 1019 uvmpdpol_init_cpu(ucpu); 1020 1021 /* 1022 * Attach RNG source for this CPU's VM events 1023 */ 1024 rnd_attach_source(&ucpu->rs, ci->ci_data.cpu_name, RND_TYPE_VM, 1025 RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE| 1026 RND_FLAG_ESTIMATE_VALUE); 1027 } 1028 1029 /* 1030 * uvm_availmem: fetch the total amount of free memory in pages. this can 1031 * have a detrimental effect on performance due to false sharing; don't call 1032 * unless needed. 1033 */ 1034 1035 int 1036 uvm_availmem(void) 1037 { 1038 struct pgfreelist *pgfl; 1039 int fl, b, fpages; 1040 1041 fpages = 0; 1042 for (fl = 0; fl < VM_NFREELIST; fl++) { 1043 pgfl = &uvm.page_free[fl]; 1044 for (b = 0; b < uvm.bucketcount; b++) { 1045 fpages += pgfl->pgfl_buckets[b]->pgb_nfree; 1046 } 1047 } 1048 return fpages; 1049 } 1050 1051 /* 1052 * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a 1053 * specific freelist and specific bucket only. 1054 * 1055 * => must be at IPL_VM or higher to protect per-CPU data structures. 1056 */ 1057 1058 static struct vm_page * 1059 uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags) 1060 { 1061 int c, trycolor, colormask; 1062 struct pgflbucket *pgb; 1063 struct vm_page *pg; 1064 kmutex_t *lock; 1065 bool fill; 1066 1067 /* 1068 * Skip the bucket if empty, no lock needed. There could be many 1069 * empty freelists/buckets. 1070 */ 1071 pgb = uvm.page_free[f].pgfl_buckets[b]; 1072 if (pgb->pgb_nfree == 0) { 1073 return NULL; 1074 } 1075 1076 /* Skip bucket if low on memory. */ 1077 lock = &uvm_freelist_locks[b].lock; 1078 mutex_spin_enter(lock); 1079 if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) { 1080 if ((flags & UVM_PGA_USERESERVE) == 0 || 1081 (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon && 1082 curlwp != uvm.pagedaemon_lwp)) { 1083 mutex_spin_exit(lock); 1084 return NULL; 1085 } 1086 fill = false; 1087 } else { 1088 fill = true; 1089 } 1090 1091 /* Try all page colors as needed. */ 1092 c = trycolor = *trycolorp; 1093 colormask = uvmexp.colormask; 1094 do { 1095 pg = LIST_FIRST(&pgb->pgb_colors[c]); 1096 if (__predict_true(pg != NULL)) { 1097 /* 1098 * Got a free page! PG_FREE must be cleared under 1099 * lock because of uvm_pglistalloc(). 1100 */ 1101 LIST_REMOVE(pg, pageq.list); 1102 KASSERT(pg->flags & PG_FREE); 1103 pg->flags &= PG_ZERO; 1104 pgb->pgb_nfree--; 1105 1106 /* 1107 * While we have the bucket locked and our data 1108 * structures fresh in L1 cache, we have an ideal 1109 * opportunity to grab some pages for the freelist 1110 * cache without causing extra contention. Only do 1111 * so if we found pages in this CPU's preferred 1112 * bucket. 1113 */ 1114 if (__predict_true(b == ucpu->pgflbucket && fill)) { 1115 uvm_pgflcache_fill(ucpu, f, b, c); 1116 } 1117 mutex_spin_exit(lock); 1118 KASSERT(uvm_page_get_bucket(pg) == b); 1119 CPU_COUNT(c == trycolor ? 1120 CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1); 1121 CPU_COUNT(CPU_COUNT_CPUMISS, 1); 1122 *trycolorp = c; 1123 return pg; 1124 } 1125 c = (c + 1) & colormask; 1126 } while (c != trycolor); 1127 mutex_spin_exit(lock); 1128 1129 return NULL; 1130 } 1131 1132 /* 1133 * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates 1134 * any color from any bucket, in a specific freelist. 
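 * The per-CPU free page cache is tried first, then each bucket is walked
 * starting with this CPU's preferred bucket.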
1135 * 1136 * => must be at IPL_VM or higher to protect per-CPU data structures. 1137 */ 1138 1139 static struct vm_page * 1140 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags) 1141 { 1142 int b, trybucket, bucketcount; 1143 struct vm_page *pg; 1144 1145 /* Try for the exact thing in the per-CPU cache. */ 1146 if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) { 1147 CPU_COUNT(CPU_COUNT_CPUHIT, 1); 1148 CPU_COUNT(CPU_COUNT_COLORHIT, 1); 1149 return pg; 1150 } 1151 1152 /* Walk through all buckets, trying our preferred bucket first. */ 1153 trybucket = ucpu->pgflbucket; 1154 b = trybucket; 1155 bucketcount = uvm.bucketcount; 1156 do { 1157 pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags); 1158 if (pg != NULL) { 1159 return pg; 1160 } 1161 b = (b + 1 == bucketcount ? 0 : b + 1); 1162 } while (b != trybucket); 1163 1164 return NULL; 1165 } 1166 1167 /* 1168 * uvm_pagealloc_strat: allocate vm_page from a particular free list. 1169 * 1170 * => return null if no pages free 1171 * => wake up pagedaemon if number of free pages drops below low water mark 1172 * => if obj != NULL, obj must be locked (to put in obj's tree) 1173 * => if anon != NULL, anon must be locked (to put in anon) 1174 * => only one of obj or anon can be non-null 1175 * => caller must activate/deactivate page if it is not wired. 1176 * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL. 1177 * => policy decision: it is more important to pull a page off of the 1178 * appropriate priority free list than it is to get a zero'd or 1179 * unknown contents page. This is because we live with the 1180 * consequences of a bad free list decision for the entire 1181 * lifetime of the page, e.g. if the page comes from memory that 1182 * is slower to access. 1183 */ 1184 1185 struct vm_page * 1186 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon, 1187 int flags, int strat, int free_list) 1188 { 1189 int zeroit = 0, color; 1190 int lcv, error, s; 1191 struct uvm_cpu *ucpu; 1192 struct vm_page *pg; 1193 lwp_t *l; 1194 1195 KASSERT(obj == NULL || anon == NULL); 1196 KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0); 1197 KASSERT(off == trunc_page(off)); 1198 KASSERT(obj == NULL || rw_write_held(obj->vmobjlock)); 1199 KASSERT(anon == NULL || anon->an_lock == NULL || 1200 rw_write_held(anon->an_lock)); 1201 1202 /* 1203 * This implements a global round-robin page coloring 1204 * algorithm. 1205 */ 1206 1207 s = splvm(); 1208 ucpu = curcpu()->ci_data.cpu_uvm; 1209 if (flags & UVM_FLAG_COLORMATCH) { 1210 color = atop(off) & uvmexp.colormask; 1211 } else { 1212 color = ucpu->pgflcolor; 1213 } 1214 1215 /* 1216 * fail if any of these conditions is true: 1217 * [1] there really are no free pages, or 1218 * [2] only kernel "reserved" pages remain and 1219 * reserved pages have not been requested. 1220 * [3] only pagedaemon "reserved" pages remain and 1221 * the requestor isn't the pagedaemon. 1222 * we make kernel reserve pages available if called by a 1223 * kernel thread or a realtime thread. 1224 */ 1225 l = curlwp; 1226 if (__predict_true(l != NULL) && lwp_eprio(l) >= PRI_KTHREAD) { 1227 flags |= UVM_PGA_USERESERVE; 1228 } 1229 1230 /* If the allocator's running in NUMA mode, go with NUMA strategy. */ 1231 if (uvm.numa_alloc && strat == UVM_PGA_STRAT_NORMAL) { 1232 strat = UVM_PGA_STRAT_NUMA; 1233 } 1234 1235 again: 1236 switch (strat) { 1237 case UVM_PGA_STRAT_NORMAL: 1238 /* Check freelists: descending priority (ascending id) order. 
*/ 1239 for (lcv = 0; lcv < VM_NFREELIST; lcv++) { 1240 pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags); 1241 if (pg != NULL) { 1242 goto gotit; 1243 } 1244 } 1245 1246 /* No pages free! Have pagedaemon free some memory. */ 1247 splx(s); 1248 uvm_kick_pdaemon(); 1249 return NULL; 1250 1251 case UVM_PGA_STRAT_ONLY: 1252 case UVM_PGA_STRAT_FALLBACK: 1253 /* Attempt to allocate from the specified free list. */ 1254 KASSERT(free_list >= 0 && free_list < VM_NFREELIST); 1255 pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags); 1256 if (pg != NULL) { 1257 goto gotit; 1258 } 1259 1260 /* Fall back, if possible. */ 1261 if (strat == UVM_PGA_STRAT_FALLBACK) { 1262 strat = UVM_PGA_STRAT_NORMAL; 1263 goto again; 1264 } 1265 1266 /* No pages free! Have pagedaemon free some memory. */ 1267 splx(s); 1268 uvm_kick_pdaemon(); 1269 return NULL; 1270 1271 case UVM_PGA_STRAT_NUMA: 1272 /* 1273 * NUMA strategy: allocating from the correct bucket is more 1274 * important than observing freelist priority. Look only to 1275 * the current NUMA node; if that fails, we need to look to 1276 * other NUMA nodes, so retry with the normal strategy. 1277 */ 1278 for (lcv = 0; lcv < VM_NFREELIST; lcv++) { 1279 pg = uvm_pgflcache_alloc(ucpu, lcv, color); 1280 if (pg != NULL) { 1281 CPU_COUNT(CPU_COUNT_CPUHIT, 1); 1282 CPU_COUNT(CPU_COUNT_COLORHIT, 1); 1283 goto gotit; 1284 } 1285 pg = uvm_pagealloc_pgb(ucpu, lcv, 1286 ucpu->pgflbucket, &color, flags); 1287 if (pg != NULL) { 1288 goto gotit; 1289 } 1290 } 1291 strat = UVM_PGA_STRAT_NORMAL; 1292 goto again; 1293 1294 default: 1295 panic("uvm_pagealloc_strat: bad strat %d", strat); 1296 /* NOTREACHED */ 1297 } 1298 1299 gotit: 1300 /* 1301 * We now know which color we actually allocated from; set 1302 * the next color accordingly. 1303 */ 1304 1305 ucpu->pgflcolor = (color + 1) & uvmexp.colormask; 1306 1307 /* 1308 * while still at IPL_VM, update allocation statistics and remember 1309 * if we have to zero the page 1310 */ 1311 1312 if (flags & UVM_PGA_ZERO) { 1313 if (pg->flags & PG_ZERO) { 1314 CPU_COUNT(CPU_COUNT_PGA_ZEROHIT, 1); 1315 zeroit = 0; 1316 } else { 1317 CPU_COUNT(CPU_COUNT_PGA_ZEROMISS, 1); 1318 zeroit = 1; 1319 } 1320 } 1321 if (pg->flags & PG_ZERO) { 1322 CPU_COUNT(CPU_COUNT_ZEROPAGES, -1); 1323 } 1324 if (anon) { 1325 CPU_COUNT(CPU_COUNT_ANONPAGES, 1); 1326 CPU_COUNT(CPU_COUNT_ANONCLEAN, 1); 1327 } 1328 splx(s); 1329 KASSERT((pg->flags & ~(PG_ZERO|PG_FREE)) == 0); 1330 1331 /* 1332 * assign the page to the object. as the page was free, we know 1333 * that pg->uobject and pg->uanon are NULL. we only need to take 1334 * the page's interlock if we are changing the values. 1335 */ 1336 if (anon != NULL || obj != NULL) { 1337 mutex_enter(&pg->interlock); 1338 } 1339 pg->offset = off; 1340 pg->uobject = obj; 1341 pg->uanon = anon; 1342 KASSERT(uvm_page_owner_locked_p(pg, true)); 1343 pg->flags = PG_BUSY|PG_CLEAN|PG_FAKE; 1344 if (anon) { 1345 anon->an_page = pg; 1346 pg->flags |= PG_ANON; 1347 mutex_exit(&pg->interlock); 1348 } else if (obj) { 1349 /* 1350 * set PG_FILE|PG_AOBJ before the first uvm_pageinsert. 
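 * uvm_pageinsert_object() uses these PG_STAT bits to select the
 * per-CPU counters to update.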
1351 */ 1352 if (UVM_OBJ_IS_VNODE(obj)) { 1353 pg->flags |= PG_FILE; 1354 } else { 1355 pg->flags |= PG_AOBJ; 1356 } 1357 uvm_pageinsert_object(obj, pg); 1358 mutex_exit(&pg->interlock); 1359 error = uvm_pageinsert_tree(obj, pg); 1360 if (error != 0) { 1361 mutex_enter(&pg->interlock); 1362 uvm_pageremove_object(obj, pg); 1363 mutex_exit(&pg->interlock); 1364 uvm_pagefree(pg); 1365 return NULL; 1366 } 1367 } 1368 1369 #if defined(UVM_PAGE_TRKOWN) 1370 pg->owner_tag = NULL; 1371 #endif 1372 UVM_PAGE_OWN(pg, "new alloc"); 1373 1374 if (flags & UVM_PGA_ZERO) { 1375 /* 1376 * A zero'd page is not clean. If we got a page not already 1377 * zero'd, then we have to zero it ourselves. 1378 */ 1379 if (obj != NULL || anon != NULL) { 1380 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); 1381 } 1382 if (zeroit) { 1383 pmap_zero_page(VM_PAGE_TO_PHYS(pg)); 1384 } 1385 } 1386 1387 return(pg); 1388 } 1389 1390 /* 1391 * uvm_pagereplace: replace a page with another 1392 * 1393 * => object must be locked 1394 * => page interlocks must be held 1395 */ 1396 1397 void 1398 uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg) 1399 { 1400 struct uvm_object *uobj = oldpg->uobject; 1401 struct vm_page *pg __diagused; 1402 uint64_t idx; 1403 1404 KASSERT((oldpg->flags & PG_TABLED) != 0); 1405 KASSERT(uobj != NULL); 1406 KASSERT((newpg->flags & PG_TABLED) == 0); 1407 KASSERT(newpg->uobject == NULL); 1408 KASSERT(rw_write_held(uobj->vmobjlock)); 1409 KASSERT(mutex_owned(&oldpg->interlock)); 1410 KASSERT(mutex_owned(&newpg->interlock)); 1411 1412 newpg->uobject = uobj; 1413 newpg->offset = oldpg->offset; 1414 idx = newpg->offset >> PAGE_SHIFT; 1415 pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg); 1416 KASSERT(pg == oldpg); 1417 if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) { 1418 if ((newpg->flags & PG_CLEAN) != 0) { 1419 radix_tree_clear_tag(&uobj->uo_pages, idx, 1420 UVM_PAGE_DIRTY_TAG); 1421 } else { 1422 radix_tree_set_tag(&uobj->uo_pages, idx, 1423 UVM_PAGE_DIRTY_TAG); 1424 } 1425 } 1426 /* 1427 * oldpg's PG_STAT is stable. newpg is not reachable by others yet. 1428 */ 1429 newpg->flags |= 1430 (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT); 1431 uvm_pageinsert_object(uobj, newpg); 1432 uvm_pageremove_object(uobj, oldpg); 1433 } 1434 1435 /* 1436 * uvm_pagerealloc: reallocate a page from one object to another 1437 * 1438 * => both objects must be locked 1439 * => both interlocks must be held 1440 */ 1441 1442 void 1443 uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff) 1444 { 1445 /* 1446 * remove it from the old object 1447 */ 1448 1449 if (pg->uobject) { 1450 uvm_pageremove_tree(pg->uobject, pg); 1451 uvm_pageremove_object(pg->uobject, pg); 1452 } 1453 1454 /* 1455 * put it in the new object 1456 */ 1457 1458 if (newobj) { 1459 /* 1460 * XXX we have no in-tree users of this functionality 1461 */ 1462 panic("uvm_pagerealloc: no impl"); 1463 } 1464 } 1465 1466 #ifdef DEBUG 1467 /* 1468 * check if page is zero-filled 1469 */ 1470 void 1471 uvm_pagezerocheck(struct vm_page *pg) 1472 { 1473 int *p, *ep; 1474 1475 KASSERT(uvm_zerocheckkva != 0); 1476 1477 /* 1478 * XXX assuming pmap_kenter_pa and pmap_kremove never call 1479 * uvm page allocator. 1480 * 1481 * it might be better to have "CPU-local temporary map" pmap interface. 
1482 */ 1483 mutex_spin_enter(&uvm_zerochecklock); 1484 pmap_kenter_pa(uvm_zerocheckkva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, 0); 1485 p = (int *)uvm_zerocheckkva; 1486 ep = (int *)((char *)p + PAGE_SIZE); 1487 pmap_update(pmap_kernel()); 1488 while (p < ep) { 1489 if (*p != 0) 1490 panic("PG_ZERO page isn't zero-filled"); 1491 p++; 1492 } 1493 pmap_kremove(uvm_zerocheckkva, PAGE_SIZE); 1494 mutex_spin_exit(&uvm_zerochecklock); 1495 /* 1496 * pmap_update() is not necessary here because no one except us 1497 * uses this VA. 1498 */ 1499 } 1500 #endif /* DEBUG */ 1501 1502 /* 1503 * uvm_pagefree: free page 1504 * 1505 * => erase page's identity (i.e. remove from object) 1506 * => put page on free list 1507 * => caller must lock owning object (either anon or uvm_object) 1508 * => assumes all valid mappings of pg are gone 1509 */ 1510 1511 void 1512 uvm_pagefree(struct vm_page *pg) 1513 { 1514 struct pgfreelist *pgfl; 1515 struct pgflbucket *pgb; 1516 struct uvm_cpu *ucpu; 1517 kmutex_t *lock; 1518 int bucket, s; 1519 bool locked; 1520 1521 #ifdef DEBUG 1522 if (pg->uobject == (void *)0xdeadbeef && 1523 pg->uanon == (void *)0xdeadbeef) { 1524 panic("uvm_pagefree: freeing free page %p", pg); 1525 } 1526 #endif /* DEBUG */ 1527 1528 KASSERT((pg->flags & PG_PAGEOUT) == 0); 1529 KASSERT(!(pg->flags & PG_FREE)); 1530 KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock)); 1531 KASSERT(pg->uobject != NULL || pg->uanon == NULL || 1532 rw_write_held(pg->uanon->an_lock)); 1533 1534 /* 1535 * remove the page from the object's tree before acquiring any page 1536 * interlocks: this can acquire locks to free radixtree nodes. 1537 */ 1538 if (pg->uobject != NULL) { 1539 uvm_pageremove_tree(pg->uobject, pg); 1540 } 1541 1542 /* 1543 * if the page is loaned, resolve the loan instead of freeing. 1544 */ 1545 1546 if (pg->loan_count) { 1547 KASSERT(pg->wire_count == 0); 1548 1549 /* 1550 * if the page is owned by an anon then we just want to 1551 * drop anon ownership. the kernel will free the page when 1552 * it is done with it. if the page is owned by an object, 1553 * remove it from the object and mark it dirty for the benefit 1554 * of possible anon owners. 1555 * 1556 * regardless of previous ownership, wakeup any waiters, 1557 * unbusy the page, and we're done. 1558 */ 1559 1560 uvm_pagelock(pg); 1561 locked = true; 1562 if (pg->uobject != NULL) { 1563 uvm_pageremove_object(pg->uobject, pg); 1564 pg->flags &= ~(PG_FILE|PG_AOBJ); 1565 } else if (pg->uanon != NULL) { 1566 if ((pg->flags & PG_ANON) == 0) { 1567 pg->loan_count--; 1568 } else { 1569 pg->flags &= ~PG_ANON; 1570 cpu_count(CPU_COUNT_ANONPAGES, -1); 1571 } 1572 pg->uanon->an_page = NULL; 1573 pg->uanon = NULL; 1574 } 1575 if (pg->pqflags & PQ_WANTED) { 1576 wakeup(pg); 1577 } 1578 pg->pqflags &= ~PQ_WANTED; 1579 pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1); 1580 #ifdef UVM_PAGE_TRKOWN 1581 pg->owner_tag = NULL; 1582 #endif 1583 KASSERT((pg->flags & PG_STAT) == 0); 1584 if (pg->loan_count) { 1585 KASSERT(pg->uobject == NULL); 1586 if (pg->uanon == NULL) { 1587 uvm_pagedequeue(pg); 1588 } 1589 uvm_pageunlock(pg); 1590 return; 1591 } 1592 } else if (pg->uobject != NULL || pg->uanon != NULL || 1593 pg->wire_count != 0) { 1594 uvm_pagelock(pg); 1595 locked = true; 1596 } else { 1597 locked = false; 1598 } 1599 1600 /* 1601 * remove page from its object or anon. 
1602 */ 1603 if (pg->uobject != NULL) { 1604 uvm_pageremove_object(pg->uobject, pg); 1605 } else if (pg->uanon != NULL) { 1606 const unsigned int status = uvm_pagegetdirty(pg); 1607 pg->uanon->an_page = NULL; 1608 pg->uanon = NULL; 1609 kpreempt_disable(); 1610 CPU_COUNT(CPU_COUNT_ANONPAGES, -1); 1611 CPU_COUNT(CPU_COUNT_ANONUNKNOWN + status, -1); 1612 kpreempt_enable(); 1613 } 1614 1615 /* 1616 * if the page was wired, unwire it now. 1617 */ 1618 1619 if (pg->wire_count) { 1620 pg->wire_count = 0; 1621 atomic_dec_uint(&uvmexp.wired); 1622 } 1623 if (locked) { 1624 /* 1625 * wake anyone waiting on the page. 1626 */ 1627 if ((pg->pqflags & PQ_WANTED) != 0) { 1628 pg->pqflags &= ~PQ_WANTED; 1629 wakeup(pg); 1630 } 1631 1632 /* 1633 * now remove the page from the queues. 1634 */ 1635 uvm_pagedequeue(pg); 1636 uvm_pageunlock(pg); 1637 } else { 1638 KASSERT(!uvmpdpol_pageisqueued_p(pg)); 1639 } 1640 1641 /* 1642 * and put on free queue 1643 */ 1644 1645 #ifdef DEBUG 1646 pg->uobject = (void *)0xdeadbeef; 1647 pg->uanon = (void *)0xdeadbeef; 1648 if (pg->flags & PG_ZERO) 1649 uvm_pagezerocheck(pg); 1650 #endif /* DEBUG */ 1651 1652 /* Try to send the page to the per-CPU cache. */ 1653 s = splvm(); 1654 if (pg->flags & PG_ZERO) { 1655 CPU_COUNT(CPU_COUNT_ZEROPAGES, 1); 1656 } 1657 ucpu = curcpu()->ci_data.cpu_uvm; 1658 bucket = uvm_page_get_bucket(pg); 1659 if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) { 1660 splx(s); 1661 return; 1662 } 1663 1664 /* Didn't work. Never mind, send it to a global bucket. */ 1665 pgfl = &uvm.page_free[uvm_page_get_freelist(pg)]; 1666 pgb = pgfl->pgfl_buckets[bucket]; 1667 lock = &uvm_freelist_locks[bucket].lock; 1668 1669 mutex_spin_enter(lock); 1670 /* PG_FREE must be set under lock because of uvm_pglistalloc(). */ 1671 pg->flags = (pg->flags & PG_ZERO) | PG_FREE; 1672 LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list); 1673 pgb->pgb_nfree++; 1674 mutex_spin_exit(lock); 1675 splx(s); 1676 } 1677 1678 /* 1679 * uvm_page_unbusy: unbusy an array of pages. 1680 * 1681 * => pages must either all belong to the same object, or all belong to anons. 1682 * => if pages are object-owned, object must be locked. 1683 * => if pages are anon-owned, anons must be locked. 1684 * => caller must make sure that anon-owned pages are not PG_RELEASED. 
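 * => object-owned pages found with PG_RELEASED set are freed here
 *    instead of being unbusied.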
1685 */ 1686 1687 void 1688 uvm_page_unbusy(struct vm_page **pgs, int npgs) 1689 { 1690 struct vm_page *pg; 1691 int i; 1692 UVMHIST_FUNC("uvm_page_unbusy"); UVMHIST_CALLED(ubchist); 1693 1694 for (i = 0; i < npgs; i++) { 1695 pg = pgs[i]; 1696 if (pg == NULL || pg == PGO_DONTCARE) { 1697 continue; 1698 } 1699 1700 KASSERT(uvm_page_owner_locked_p(pg, true)); 1701 KASSERT(pg->flags & PG_BUSY); 1702 KASSERT((pg->flags & PG_PAGEOUT) == 0); 1703 if (pg->flags & PG_RELEASED) { 1704 UVMHIST_LOG(ubchist, "releasing pg %#jx", 1705 (uintptr_t)pg, 0, 0, 0); 1706 KASSERT(pg->uobject != NULL || 1707 (pg->uanon != NULL && pg->uanon->an_ref > 0)); 1708 pg->flags &= ~PG_RELEASED; 1709 uvm_pagefree(pg); 1710 } else { 1711 UVMHIST_LOG(ubchist, "unbusying pg %#jx", 1712 (uintptr_t)pg, 0, 0, 0); 1713 KASSERT((pg->flags & PG_FAKE) == 0); 1714 pg->flags &= ~PG_BUSY; 1715 uvm_pagelock(pg); 1716 uvm_pagewakeup(pg); 1717 uvm_pageunlock(pg); 1718 UVM_PAGE_OWN(pg, NULL); 1719 } 1720 } 1721 } 1722 1723 /* 1724 * uvm_pagewait: wait for a busy page 1725 * 1726 * => page must be known PG_BUSY 1727 * => object must be read or write locked 1728 * => object will be unlocked on return 1729 */ 1730 1731 void 1732 uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg) 1733 { 1734 1735 KASSERT(rw_lock_held(lock)); 1736 KASSERT((pg->flags & PG_BUSY) != 0); 1737 KASSERT(uvm_page_owner_locked_p(pg, false)); 1738 1739 mutex_enter(&pg->interlock); 1740 rw_exit(lock); 1741 pg->pqflags |= PQ_WANTED; 1742 UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0); 1743 } 1744 1745 /* 1746 * uvm_pagewakeup: wake anyone waiting on a page 1747 * 1748 * => page interlock must be held 1749 */ 1750 1751 void 1752 uvm_pagewakeup(struct vm_page *pg) 1753 { 1754 UVMHIST_FUNC("uvm_pagewakeup"); UVMHIST_CALLED(ubchist); 1755 1756 KASSERT(mutex_owned(&pg->interlock)); 1757 1758 UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0); 1759 1760 if ((pg->pqflags & PQ_WANTED) != 0) { 1761 wakeup(pg); 1762 pg->pqflags &= ~PQ_WANTED; 1763 } 1764 } 1765 1766 #if defined(UVM_PAGE_TRKOWN) 1767 /* 1768 * uvm_page_own: set or release page ownership 1769 * 1770 * => this is a debugging function that keeps track of who sets PG_BUSY 1771 * and where they do it. it can be used to track down problems 1772 * such a process setting "PG_BUSY" and never releasing it. 1773 * => page's object [if any] must be locked 1774 * => if "tag" is NULL then we are releasing page ownership 1775 */ 1776 void 1777 uvm_page_own(struct vm_page *pg, const char *tag) 1778 { 1779 1780 KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0); 1781 KASSERT(uvm_page_owner_locked_p(pg, true)); 1782 1783 /* gain ownership? */ 1784 if (tag) { 1785 KASSERT((pg->flags & PG_BUSY) != 0); 1786 if (pg->owner_tag) { 1787 printf("uvm_page_own: page %p already owned " 1788 "by proc %d [%s]\n", pg, 1789 pg->owner, pg->owner_tag); 1790 panic("uvm_page_own"); 1791 } 1792 pg->owner = curproc->p_pid; 1793 pg->lowner = curlwp->l_lid; 1794 pg->owner_tag = tag; 1795 return; 1796 } 1797 1798 /* drop ownership */ 1799 KASSERT((pg->flags & PG_BUSY) == 0); 1800 if (pg->owner_tag == NULL) { 1801 printf("uvm_page_own: dropping ownership of an non-owned " 1802 "page (%p)\n", pg); 1803 panic("uvm_page_own"); 1804 } 1805 pg->owner_tag = NULL; 1806 } 1807 #endif 1808 1809 /* 1810 * uvm_pageidlezero: zero free pages while the system is idle. 1811 */ 1812 void 1813 uvm_pageidlezero(void) 1814 { 1815 1816 /* 1817 * Disabled for the moment. Previous strategy too cache heavy. 
In 1818 * the future we may experiment with zeroing the pages held in the 1819 * per-CPU cache (uvm_pgflcache). 1820 */ 1821 } 1822 1823 /* 1824 * uvm_pagelookup: look up a page 1825 * 1826 * => caller should lock object to keep someone from pulling the page 1827 * out from under it 1828 */ 1829 1830 struct vm_page * 1831 uvm_pagelookup(struct uvm_object *obj, voff_t off) 1832 { 1833 struct vm_page *pg; 1834 1835 /* No - used from DDB. KASSERT(rw_lock_held(obj->vmobjlock)); */ 1836 1837 pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT); 1838 1839 KASSERT(pg == NULL || obj->uo_npages != 0); 1840 KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 || 1841 (pg->flags & PG_BUSY) != 0); 1842 return pg; 1843 } 1844 1845 /* 1846 * uvm_pagewire: wire the page, thus removing it from the daemon's grasp 1847 * 1848 * => caller must lock objects 1849 * => caller must hold pg->interlock 1850 */ 1851 1852 void 1853 uvm_pagewire(struct vm_page *pg) 1854 { 1855 1856 KASSERT(uvm_page_owner_locked_p(pg, true)); 1857 KASSERT(mutex_owned(&pg->interlock)); 1858 #if defined(READAHEAD_STATS) 1859 if ((pg->flags & PG_READAHEAD) != 0) { 1860 uvm_ra_hit.ev_count++; 1861 pg->flags &= ~PG_READAHEAD; 1862 } 1863 #endif /* defined(READAHEAD_STATS) */ 1864 if (pg->wire_count == 0) { 1865 uvm_pagedequeue(pg); 1866 atomic_inc_uint(&uvmexp.wired); 1867 } 1868 pg->wire_count++; 1869 KASSERT(pg->wire_count > 0); /* detect wraparound */ 1870 } 1871 1872 /* 1873 * uvm_pageunwire: unwire the page. 1874 * 1875 * => activate if wire count goes to zero. 1876 * => caller must lock objects 1877 * => caller must hold pg->interlock 1878 */ 1879 1880 void 1881 uvm_pageunwire(struct vm_page *pg) 1882 { 1883 1884 KASSERT(uvm_page_owner_locked_p(pg, true)); 1885 KASSERT(pg->wire_count != 0); 1886 KASSERT(!uvmpdpol_pageisqueued_p(pg)); 1887 KASSERT(mutex_owned(&pg->interlock)); 1888 pg->wire_count--; 1889 if (pg->wire_count == 0) { 1890 uvm_pageactivate(pg); 1891 KASSERT(uvmexp.wired != 0); 1892 atomic_dec_uint(&uvmexp.wired); 1893 } 1894 } 1895 1896 /* 1897 * uvm_pagedeactivate: deactivate page 1898 * 1899 * => caller must lock objects 1900 * => caller must check to make sure page is not wired 1901 * => object that page belongs to must be locked (so we can adjust pg->flags) 1902 * => caller must clear the reference on the page before calling 1903 * => caller must hold pg->interlock 1904 */ 1905 1906 void 1907 uvm_pagedeactivate(struct vm_page *pg) 1908 { 1909 1910 KASSERT(uvm_page_owner_locked_p(pg, false)); 1911 KASSERT(mutex_owned(&pg->interlock)); 1912 if (pg->wire_count == 0) { 1913 KASSERT(uvmpdpol_pageisqueued_p(pg)); 1914 uvmpdpol_pagedeactivate(pg); 1915 } 1916 } 1917 1918 /* 1919 * uvm_pageactivate: activate page 1920 * 1921 * => caller must lock objects 1922 * => caller must hold pg->interlock 1923 */ 1924 1925 void 1926 uvm_pageactivate(struct vm_page *pg) 1927 { 1928 1929 KASSERT(uvm_page_owner_locked_p(pg, false)); 1930 KASSERT(mutex_owned(&pg->interlock)); 1931 #if defined(READAHEAD_STATS) 1932 if ((pg->flags & PG_READAHEAD) != 0) { 1933 uvm_ra_hit.ev_count++; 1934 pg->flags &= ~PG_READAHEAD; 1935 } 1936 #endif /* defined(READAHEAD_STATS) */ 1937 if (pg->wire_count == 0) { 1938 uvmpdpol_pageactivate(pg); 1939 } 1940 } 1941 1942 /* 1943 * uvm_pagedequeue: remove a page from any paging queue 1944 * 1945 * => caller must lock objects 1946 * => caller must hold pg->interlock 1947 */ 1948 void 1949 uvm_pagedequeue(struct vm_page *pg) 1950 { 1951 1952 KASSERT(uvm_page_owner_locked_p(pg, true)); 1953 
KASSERT(mutex_owned(&pg->interlock)); 1954 if (uvmpdpol_pageisqueued_p(pg)) { 1955 uvmpdpol_pagedequeue(pg); 1956 } 1957 } 1958 1959 /* 1960 * uvm_pageenqueue: add a page to a paging queue without activating. 1961 * used where a page is not really demanded (yet). eg. read-ahead 1962 * 1963 * => caller must lock objects 1964 * => caller must hold pg->interlock 1965 */ 1966 void 1967 uvm_pageenqueue(struct vm_page *pg) 1968 { 1969 1970 KASSERT(uvm_page_owner_locked_p(pg, false)); 1971 KASSERT(mutex_owned(&pg->interlock)); 1972 if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) { 1973 uvmpdpol_pageenqueue(pg); 1974 } 1975 } 1976 1977 /* 1978 * uvm_pagelock: acquire page interlock 1979 */ 1980 void 1981 uvm_pagelock(struct vm_page *pg) 1982 { 1983 1984 mutex_enter(&pg->interlock); 1985 } 1986 1987 /* 1988 * uvm_pagelock2: acquire two page interlocks 1989 */ 1990 void 1991 uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2) 1992 { 1993 1994 if (pg1 < pg2) { 1995 mutex_enter(&pg1->interlock); 1996 mutex_enter(&pg2->interlock); 1997 } else { 1998 mutex_enter(&pg2->interlock); 1999 mutex_enter(&pg1->interlock); 2000 } 2001 } 2002 2003 /* 2004 * uvm_pageunlock: release page interlock, and if a page replacement intent 2005 * is set on the page, pass it to uvmpdpol to make real. 2006 * 2007 * => caller must hold pg->interlock 2008 */ 2009 void 2010 uvm_pageunlock(struct vm_page *pg) 2011 { 2012 2013 if ((pg->pqflags & PQ_INTENT_SET) == 0 || 2014 (pg->pqflags & PQ_INTENT_QUEUED) != 0) { 2015 mutex_exit(&pg->interlock); 2016 return; 2017 } 2018 pg->pqflags |= PQ_INTENT_QUEUED; 2019 mutex_exit(&pg->interlock); 2020 uvmpdpol_pagerealize(pg); 2021 } 2022 2023 /* 2024 * uvm_pageunlock2: release two page interlocks, and for both pages if a 2025 * page replacement intent is set on the page, pass it to uvmpdpol to make 2026 * real. 2027 * 2028 * => caller must hold pg->interlock 2029 */ 2030 void 2031 uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2) 2032 { 2033 2034 if ((pg1->pqflags & PQ_INTENT_SET) == 0 || 2035 (pg1->pqflags & PQ_INTENT_QUEUED) != 0) { 2036 mutex_exit(&pg1->interlock); 2037 pg1 = NULL; 2038 } else { 2039 pg1->pqflags |= PQ_INTENT_QUEUED; 2040 mutex_exit(&pg1->interlock); 2041 } 2042 2043 if ((pg2->pqflags & PQ_INTENT_SET) == 0 || 2044 (pg2->pqflags & PQ_INTENT_QUEUED) != 0) { 2045 mutex_exit(&pg2->interlock); 2046 pg2 = NULL; 2047 } else { 2048 pg2->pqflags |= PQ_INTENT_QUEUED; 2049 mutex_exit(&pg2->interlock); 2050 } 2051 2052 if (pg1 != NULL) { 2053 uvmpdpol_pagerealize(pg1); 2054 } 2055 if (pg2 != NULL) { 2056 uvmpdpol_pagerealize(pg2); 2057 } 2058 } 2059 2060 /* 2061 * uvm_pagezero: zero fill a page 2062 * 2063 * => if page is part of an object then the object should be locked 2064 * to protect pg->flags. 2065 */ 2066 2067 void 2068 uvm_pagezero(struct vm_page *pg) 2069 { 2070 2071 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); 2072 pmap_zero_page(VM_PAGE_TO_PHYS(pg)); 2073 } 2074 2075 /* 2076 * uvm_pagecopy: copy a page 2077 * 2078 * => if page is part of an object then the object should be locked 2079 * to protect pg->flags. 2080 */ 2081 2082 void 2083 uvm_pagecopy(struct vm_page *src, struct vm_page *dst) 2084 { 2085 2086 uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY); 2087 pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst)); 2088 } 2089 2090 /* 2091 * uvm_pageismanaged: test it see that a page (specified by PA) is managed. 

/*
 * uvm_pagezero: zero fill a page
 *
 * => if page is part of an object then the object should be locked
 *    to protect pg->flags.
 */

void
uvm_pagezero(struct vm_page *pg)
{

	uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
	pmap_zero_page(VM_PAGE_TO_PHYS(pg));
}

/*
 * uvm_pagecopy: copy a page
 *
 * => if page is part of an object then the object should be locked
 *    to protect pg->flags.
 */

void
uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
{

	uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
	pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
}

/*
 * uvm_pageismanaged: test to see whether a page (specified by PA) is managed.
 */

bool
uvm_pageismanaged(paddr_t pa)
{

	return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
}

/*
 * uvm_page_lookup_freelist: look up the free list for the specified page
 */

int
uvm_page_lookup_freelist(struct vm_page *pg)
{
	uvm_physseg_t upm;

	upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
	KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
	return uvm_physseg_get_free_list(upm);
}

/*
 * uvm_page_owner_locked_p: return true if the object associated with the
 * page is locked.  this is a weak check for runtime assertions only.
 */

bool
uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
{

	if (pg->uobject != NULL) {
		return exclusive
		    ? rw_write_held(pg->uobject->vmobjlock)
		    : rw_lock_held(pg->uobject->vmobjlock);
	}
	if (pg->uanon != NULL) {
		return exclusive
		    ? rw_write_held(pg->uanon->an_lock)
		    : rw_lock_held(pg->uanon->an_lock);
	}
	return true;
}

/*
 * uvm_pagereadonly_p: return whether the page should be mapped read-only
 */

bool
uvm_pagereadonly_p(struct vm_page *pg)
{
	struct uvm_object * const uobj = pg->uobject;

	KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
	KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
	if ((pg->flags & PG_RDONLY) != 0) {
		return true;
	}
	if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
		return true;
	}
	if (uobj == NULL) {
		return false;
	}
	return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
}
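
/*
 * Example (illustrative sketch only, not compiled): a hypothetical fault
 * handler stripping write permission when uvm_pagereadonly_p() says the
 * page must be mapped read-only, so that a later write faults again and
 * the page can then be marked dirty.  The function name and parameters
 * are placeholders; the pmap, va and access_type are assumed to come from
 * the caller's fault context.
 */
#if 0
static int
example_enter_mapping(pmap_t pmap, vaddr_t va, struct vm_page *pg,
    vm_prot_t access_type)
{
	vm_prot_t prot = VM_PROT_READ | VM_PROT_WRITE;

	if (uvm_pagereadonly_p(pg)) {
		prot &= ~VM_PROT_WRITE;	/* force a write fault later */
	}
	return pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), prot,
	    PMAP_CANFAIL | access_type);
}
#endif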
2186 */ 2187 KASSERT(pg != NULL && pg != PGO_DONTCARE); 2188 2189 pa = VM_PAGE_TO_PHYS(pg); 2190 todo = MIN(len, PAGE_SIZE - pgoff); 2191 2192 error = pmap_direct_process(pa, pgoff, todo, process, arg); 2193 if (error) 2194 break; 2195 2196 pgoff = 0; 2197 len -= todo; 2198 } 2199 2200 KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len); 2201 return error; 2202 } 2203 #endif /* PMAP_DIRECT */ 2204 2205 #if defined(DDB) || defined(DEBUGPRINT) 2206 2207 /* 2208 * uvm_page_printit: actually print the page 2209 */ 2210 2211 static const char page_flagbits[] = UVM_PGFLAGBITS; 2212 static const char page_pqflagbits[] = UVM_PQFLAGBITS; 2213 2214 void 2215 uvm_page_printit(struct vm_page *pg, bool full, 2216 void (*pr)(const char *, ...)) 2217 { 2218 struct vm_page *tpg; 2219 struct uvm_object *uobj; 2220 struct pgflbucket *pgb; 2221 struct pgflist *pgl; 2222 char pgbuf[128]; 2223 2224 (*pr)("PAGE %p:\n", pg); 2225 snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags); 2226 (*pr)(" flags=%s\n", pgbuf); 2227 snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags); 2228 (*pr)(" pqflags=%s\n", pgbuf); 2229 (*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n", 2230 pg->uobject, pg->uanon, (long long)pg->offset); 2231 (*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n", 2232 pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg), 2233 uvm_page_get_freelist(pg)); 2234 (*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg)); 2235 #if defined(UVM_PAGE_TRKOWN) 2236 if (pg->flags & PG_BUSY) 2237 (*pr)(" owning process = %d, tag=%s\n", 2238 pg->owner, pg->owner_tag); 2239 else 2240 (*pr)(" page not busy, no owner\n"); 2241 #else 2242 (*pr)(" [page ownership tracking disabled]\n"); 2243 #endif 2244 2245 if (!full) 2246 return; 2247 2248 /* cross-verify object/anon */ 2249 if ((pg->flags & PG_FREE) == 0) { 2250 if (pg->flags & PG_ANON) { 2251 if (pg->uanon == NULL || pg->uanon->an_page != pg) 2252 (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n", 2253 (pg->uanon) ? pg->uanon->an_page : NULL); 2254 else 2255 (*pr)(" anon backpointer is OK\n"); 2256 } else { 2257 uobj = pg->uobject; 2258 if (uobj) { 2259 (*pr)(" checking object list\n"); 2260 tpg = uvm_pagelookup(uobj, pg->offset); 2261 if (tpg) 2262 (*pr)(" page found on object list\n"); 2263 else 2264 (*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n"); 2265 } 2266 } 2267 } 2268 2269 /* cross-verify page queue */ 2270 if (pg->flags & PG_FREE) { 2271 int fl = uvm_page_get_freelist(pg); 2272 int b = uvm_page_get_bucket(pg); 2273 pgb = uvm.page_free[fl].pgfl_buckets[b]; 2274 pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)]; 2275 (*pr)(" checking pageq list\n"); 2276 LIST_FOREACH(tpg, pgl, pageq.list) { 2277 if (tpg == pg) { 2278 break; 2279 } 2280 } 2281 if (tpg) 2282 (*pr)(" page found on pageq list\n"); 2283 else 2284 (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! 

#if defined(DDB) || defined(DEBUGPRINT)

/*
 * uvm_page_printit: actually print the page
 */

static const char page_flagbits[] = UVM_PGFLAGBITS;
static const char page_pqflagbits[] = UVM_PQFLAGBITS;

void
uvm_page_printit(struct vm_page *pg, bool full,
    void (*pr)(const char *, ...))
{
	struct vm_page *tpg;
	struct uvm_object *uobj;
	struct pgflbucket *pgb;
	struct pgflist *pgl;
	char pgbuf[128];

	(*pr)("PAGE %p:\n", pg);
	snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
	(*pr)(" flags=%s\n", pgbuf);
	snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
	(*pr)(" pqflags=%s\n", pgbuf);
	(*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n",
	    pg->uobject, pg->uanon, (long long)pg->offset);
	(*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
	    pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
	    uvm_page_get_freelist(pg));
	(*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
#if defined(UVM_PAGE_TRKOWN)
	if (pg->flags & PG_BUSY)
		(*pr)(" owning process = %d, tag=%s\n",
		    pg->owner, pg->owner_tag);
	else
		(*pr)(" page not busy, no owner\n");
#else
	(*pr)(" [page ownership tracking disabled]\n");
#endif

	if (!full)
		return;

	/* cross-verify object/anon */
	if ((pg->flags & PG_FREE) == 0) {
		if (pg->flags & PG_ANON) {
			if (pg->uanon == NULL || pg->uanon->an_page != pg)
				(*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
				    (pg->uanon) ? pg->uanon->an_page : NULL);
			else
				(*pr)(" anon backpointer is OK\n");
		} else {
			uobj = pg->uobject;
			if (uobj) {
				(*pr)(" checking object list\n");
				tpg = uvm_pagelookup(uobj, pg->offset);
				if (tpg)
					(*pr)(" page found on object list\n");
				else
					(*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
			}
		}
	}

	/* cross-verify page queue */
	if (pg->flags & PG_FREE) {
		int fl = uvm_page_get_freelist(pg);
		int b = uvm_page_get_bucket(pg);
		pgb = uvm.page_free[fl].pgfl_buckets[b];
		pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
		(*pr)(" checking pageq list\n");
		LIST_FOREACH(tpg, pgl, pageq.list) {
			if (tpg == pg) {
				break;
			}
		}
		if (tpg)
			(*pr)(" page found on pageq list\n");
		else
			(*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
	}
}

/*
 * uvm_page_printall - print a summary of all managed pages
 */

void
uvm_page_printall(void (*pr)(const char *, ...))
{
	uvm_physseg_t i;
	paddr_t pfn;
	struct vm_page *pg;

	(*pr)("%18s %4s %4s %18s %18s"
#ifdef UVM_PAGE_TRKOWN
	    " OWNER"
#endif
	    "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
	for (i = uvm_physseg_get_first();
	     uvm_physseg_valid_p(i);
	     i = uvm_physseg_get_next(i)) {
		for (pfn = uvm_physseg_get_start(i);
		     pfn < uvm_physseg_get_end(i);
		     pfn++) {
			pg = PHYS_TO_VM_PAGE(ptoa(pfn));

			(*pr)("%18p %04x %08x %18p %18p",
			    pg, pg->flags, pg->pqflags, pg->uobject,
			    pg->uanon);
#ifdef UVM_PAGE_TRKOWN
			if (pg->flags & PG_BUSY)
				(*pr)(" %d [%s]", pg->owner, pg->owner_tag);
#endif
			(*pr)("\n");
		}
	}
}

/*
 * uvm_page_print_freelists - print a summary of the freelists
 */

void
uvm_page_print_freelists(void (*pr)(const char *, ...))
{
	struct pgfreelist *pgfl;
	struct pgflbucket *pgb;
	int fl, b, c;

	(*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
	    VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);

	for (fl = 0; fl < VM_NFREELIST; fl++) {
		pgfl = &uvm.page_free[fl];
		(*pr)("freelist(%d) @ %p\n", fl, pgfl);
		for (b = 0; b < uvm.bucketcount; b++) {
			pgb = uvm.page_free[fl].pgfl_buckets[b];
			(*pr)("    bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
			    b, pgb, pgb->pgb_nfree,
			    &uvm_freelist_locks[b].lock);
			for (c = 0; c < uvmexp.ncolors; c++) {
				(*pr)("        color(%d) @ %p, ", c,
				    &pgb->pgb_colors[c]);
				(*pr)("first page = %p\n",
				    LIST_FIRST(&pgb->pgb_colors[c]));
			}
		}
	}
}

#endif /* DDB || DEBUGPRINT */