/*	$NetBSD: uvm_page.c,v 1.244 2020/07/09 05:57:15 skrll Exp $	*/

/*-
 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993, The Regents of the University of California.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_page.c   8.3 (Berkeley) 3/21/94
 * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * uvm_page.c: page ops.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.244 2020/07/09 05:57:15 skrll Exp $");

#include "opt_ddb.h"
#include "opt_uvm.h"
#include "opt_uvmhist.h"
#include "opt_readahead.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/radixtree.h>
#include <sys/atomic.h>
#include <sys/cpu.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pgflcache.h>

/*
 * number of pages per-CPU to reserve for the kernel.
 */
#ifndef	UVM_RESERVED_PAGES_PER_CPU
#define	UVM_RESERVED_PAGES_PER_CPU	5
#endif
int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;

/*
 * physical memory size;
 */
psize_t physmem;

/*
 * local variables
 */

/*
 * these variables record the values returned by vm_page_bootstrap,
 * for debugging purposes.  The implementation of uvm_pageboot_alloc
 * and pmap_startup here also uses them internally.
 */

static vaddr_t virtual_space_start;
static vaddr_t virtual_space_end;

/*
 * we allocate an initial number of page colors in uvm_page_init(),
 * and remember them.  We may re-color pages as cache sizes are
 * discovered during the autoconfiguration phase.  But we can never
 * free the initial set of buckets, since they are allocated using
 * uvm_pageboot_alloc().
 */

static size_t recolored_pages_memsize /* = 0 */;
static char *recolored_pages_mem;

/*
 * freelist locks - one per bucket.
 */

union uvm_freelist_lock	uvm_freelist_locks[PGFL_MAX_BUCKETS]
    __cacheline_aligned;

/*
 * basic NUMA information.
 */

static struct uvm_page_numa_region {
	struct uvm_page_numa_region	*next;
	paddr_t				start;
	paddr_t				size;
	u_int				numa_id;
} *uvm_page_numa_region;

#ifdef DEBUG
kmutex_t uvm_zerochecklock __cacheline_aligned;
vaddr_t uvm_zerocheckkva;
#endif /* DEBUG */

/*
 * These functions are reserved for uvm(9) internal use and are not
 * exported in the header file uvm_physseg.h
 *
 * Thus they are redefined here.
185 */ 186 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *); 187 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t); 188 189 /* returns a pgs array */ 190 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t); 191 192 /* 193 * inline functions 194 */ 195 196 /* 197 * uvm_pageinsert: insert a page in the object. 198 * 199 * => caller must lock object 200 * => call should have already set pg's object and offset pointers 201 * and bumped the version counter 202 */ 203 204 static inline void 205 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg) 206 { 207 208 KASSERT(uobj == pg->uobject); 209 KASSERT(rw_write_held(uobj->vmobjlock)); 210 KASSERT((pg->flags & PG_TABLED) == 0); 211 212 if ((pg->flags & PG_STAT) != 0) { 213 /* Cannot use uvm_pagegetdirty(): not yet in radix tree. */ 214 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY); 215 216 if ((pg->flags & PG_FILE) != 0) { 217 if (uobj->uo_npages == 0) { 218 struct vnode *vp = (struct vnode *)uobj; 219 mutex_enter(vp->v_interlock); 220 KASSERT((vp->v_iflag & VI_PAGES) == 0); 221 vp->v_iflag |= VI_PAGES; 222 vholdl(vp); 223 mutex_exit(vp->v_interlock); 224 } 225 if (UVM_OBJ_IS_VTEXT(uobj)) { 226 cpu_count(CPU_COUNT_EXECPAGES, 1); 227 } 228 cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1); 229 } else { 230 cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1); 231 } 232 } 233 pg->flags |= PG_TABLED; 234 uobj->uo_npages++; 235 } 236 237 static inline int 238 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg) 239 { 240 const uint64_t idx = pg->offset >> PAGE_SHIFT; 241 int error; 242 243 error = radix_tree_insert_node(&uobj->uo_pages, idx, pg); 244 if (error != 0) { 245 return error; 246 } 247 if ((pg->flags & PG_CLEAN) == 0) { 248 radix_tree_set_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG); 249 } 250 KASSERT(((pg->flags & PG_CLEAN) == 0) == 251 radix_tree_get_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG)); 252 return 0; 253 } 254 255 /* 256 * uvm_page_remove: remove page from object. 257 * 258 * => caller must lock object 259 */ 260 261 static inline void 262 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg) 263 { 264 265 KASSERT(uobj == pg->uobject); 266 KASSERT(rw_write_held(uobj->vmobjlock)); 267 KASSERT(pg->flags & PG_TABLED); 268 269 if ((pg->flags & PG_STAT) != 0) { 270 /* Cannot use uvm_pagegetdirty(): no longer in radix tree. 
/*
 * uvm_page_remove: remove page from object.
 *
 * => caller must lock object
 */

static inline void
uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
{

	KASSERT(uobj == pg->uobject);
	KASSERT(rw_write_held(uobj->vmobjlock));
	KASSERT(pg->flags & PG_TABLED);

	if ((pg->flags & PG_STAT) != 0) {
		/* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
		const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);

		if ((pg->flags & PG_FILE) != 0) {
			if (uobj->uo_npages == 1) {
				struct vnode *vp = (struct vnode *)uobj;
				mutex_enter(vp->v_interlock);
				KASSERT((vp->v_iflag & VI_PAGES) != 0);
				vp->v_iflag &= ~VI_PAGES;
				holdrelel(vp);
				mutex_exit(vp->v_interlock);
			}
			if (UVM_OBJ_IS_VTEXT(uobj)) {
				cpu_count(CPU_COUNT_EXECPAGES, -1);
			}
			cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1);
		} else {
			cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
		}
	}
	uobj->uo_npages--;
	pg->flags &= ~PG_TABLED;
	pg->uobject = NULL;
}

static inline void
uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
{
	struct vm_page *opg __unused;

	opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
	KASSERT(pg == opg);
}

static void
uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
{
	int i;

	pgb->pgb_nfree = 0;
	for (i = 0; i < uvmexp.ncolors; i++) {
		LIST_INIT(&pgb->pgb_colors[i]);
	}
	pgfl->pgfl_buckets[num] = pgb;
}

/*
 * uvm_page_init: init the page system.   called from uvm_init().
 *
 * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
 */

void
uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
{
	static struct uvm_cpu boot_cpu __cacheline_aligned;
	psize_t freepages, pagecount, bucketsize, n;
	struct pgflbucket *pgb;
	struct vm_page *pagearray;
	char *bucketarray;
	uvm_physseg_t bank;
	int fl, b;

	KASSERT(ncpu <= 1);

	/*
	 * init the page queues and free page queue locks, except the
	 * free list; we allocate that later (with the initial vm_page
	 * structures).
	 */

	curcpu()->ci_data.cpu_uvm = &boot_cpu;
	uvmpdpol_init();
	for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
		mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
	}

	/*
	 * allocate vm_page structures.
	 */

	/*
	 * sanity check:
	 * before calling this function the MD code is expected to register
	 * some free RAM with the uvm_page_physload() function.  our job
	 * now is to allocate vm_page structures for this memory.
	 */

	if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
		panic("uvm_page_bootstrap: no memory pre-allocated");

	/*
	 * first calculate the number of free pages...
	 *
	 * note that we use start/end rather than avail_start/avail_end.
	 * this allows us to allocate extra vm_page structures in case we
	 * want to return some memory to the pool after booting.
	 */

	freepages = 0;

	for (bank = uvm_physseg_get_first();
	     uvm_physseg_valid_p(bank);
	     bank = uvm_physseg_get_next(bank)) {
		freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
	}

	/*
	 * Let MD code initialize the number of colors, or default
	 * to 1 color if MD code doesn't care.
	 */
	if (uvmexp.ncolors == 0)
		uvmexp.ncolors = 1;
	uvmexp.colormask = uvmexp.ncolors - 1;
	KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);

	/* We always start with only 1 bucket. */
	uvm.bucketcount = 1;

	/*
	 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
	 * use.  for each page of memory we use we need a vm_page structure.
	 * thus, the total number of pages we can use is the total size of
	 * the memory divided by the PAGE_SIZE plus the size of the vm_page
	 * structure.  we add one to freepages as a fudge factor to avoid
	 * truncation errors (since we can only allocate in terms of whole
	 * pages).
	 */
	pagecount = ((freepages + 1) << PAGE_SHIFT) /
	    (PAGE_SIZE + sizeof(struct vm_page));
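	/*
	 * For example, with 4 KiB pages and an (illustrative) 128 byte
	 * struct vm_page, this works out to roughly
	 * pagecount ~= freepages * 4096 / 4224, i.e. about 3% of the
	 * managed RAM is consumed by the vm_page array itself.
	 */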
	bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
	bucketsize = roundup2(bucketsize, coherency_unit);
	bucketarray = (void *)uvm_pageboot_alloc(
	    bucketsize * VM_NFREELIST +
	    pagecount * sizeof(struct vm_page));
	pagearray = (struct vm_page *)
	    (bucketarray + bucketsize * VM_NFREELIST);

	for (fl = 0; fl < VM_NFREELIST; fl++) {
		pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
		uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
	}
	memset(pagearray, 0, pagecount * sizeof(struct vm_page));

	/*
	 * init the freelist cache in the disabled state.
	 */
	uvm_pgflcache_init();

	/*
	 * init the vm_page structures and put them in the correct place.
	 */
	/* First init the extent */

	for (bank = uvm_physseg_get_first(),
	     uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
	     uvm_physseg_valid_p(bank);
	     bank = uvm_physseg_get_next(bank)) {

		n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
		uvm_physseg_seg_alloc_from_slab(bank, n);
		uvm_physseg_init_seg(bank, pagearray);

		/* set up page array pointers */
		pagearray += n;
		pagecount -= n;
	}

	/*
	 * pass up the values of virtual_space_start and
	 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
	 * layers of the VM.
	 */

	*kvm_startp = round_page(virtual_space_start);
	*kvm_endp = trunc_page(virtual_space_end);
#ifdef DEBUG
	/*
	 * steal kva for uvm_pagezerocheck().
	 */
	uvm_zerocheckkva = *kvm_startp;
	*kvm_startp += PAGE_SIZE;
	mutex_init(&uvm_zerochecklock, MUTEX_DEFAULT, IPL_VM);
#endif /* DEBUG */

	/*
	 * init various thresholds.
	 */

	uvmexp.reserve_pagedaemon = 1;
	uvmexp.reserve_kernel = vm_page_reserve_kernel;

	/*
	 * done!
	 */

	uvm.page_init_done = true;
}

/*
 * uvm_pgfl_lock: lock all freelist buckets
 */

void
uvm_pgfl_lock(void)
{
	int i;

	for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
		mutex_spin_enter(&uvm_freelist_locks[i].lock);
	}
}

/*
 * uvm_pgfl_unlock: unlock all freelist buckets
 */

void
uvm_pgfl_unlock(void)
{
	int i;

	for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
		mutex_spin_exit(&uvm_freelist_locks[i].lock);
	}
}

/*
 * uvm_setpagesize: set the page size
 *
 * => sets page_shift and page_mask from uvmexp.pagesize.
 */

void
uvm_setpagesize(void)
{

	/*
	 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
	 * to be a constant (indicated by being a non-zero value).
510 */ 511 if (uvmexp.pagesize == 0) { 512 if (PAGE_SIZE == 0) 513 panic("uvm_setpagesize: uvmexp.pagesize not set"); 514 uvmexp.pagesize = PAGE_SIZE; 515 } 516 uvmexp.pagemask = uvmexp.pagesize - 1; 517 if ((uvmexp.pagemask & uvmexp.pagesize) != 0) 518 panic("uvm_setpagesize: page size %u (%#x) not a power of two", 519 uvmexp.pagesize, uvmexp.pagesize); 520 for (uvmexp.pageshift = 0; ; uvmexp.pageshift++) 521 if ((1 << uvmexp.pageshift) == uvmexp.pagesize) 522 break; 523 } 524 525 /* 526 * uvm_pageboot_alloc: steal memory from physmem for bootstrapping 527 */ 528 529 vaddr_t 530 uvm_pageboot_alloc(vsize_t size) 531 { 532 static bool initialized = false; 533 vaddr_t addr; 534 #if !defined(PMAP_STEAL_MEMORY) 535 vaddr_t vaddr; 536 paddr_t paddr; 537 #endif 538 539 /* 540 * on first call to this function, initialize ourselves. 541 */ 542 if (initialized == false) { 543 pmap_virtual_space(&virtual_space_start, &virtual_space_end); 544 545 /* round it the way we like it */ 546 virtual_space_start = round_page(virtual_space_start); 547 virtual_space_end = trunc_page(virtual_space_end); 548 549 initialized = true; 550 } 551 552 /* round to page size */ 553 size = round_page(size); 554 uvmexp.bootpages += atop(size); 555 556 #if defined(PMAP_STEAL_MEMORY) 557 558 /* 559 * defer bootstrap allocation to MD code (it may want to allocate 560 * from a direct-mapped segment). pmap_steal_memory should adjust 561 * virtual_space_start/virtual_space_end if necessary. 562 */ 563 564 addr = pmap_steal_memory(size, &virtual_space_start, 565 &virtual_space_end); 566 567 return(addr); 568 569 #else /* !PMAP_STEAL_MEMORY */ 570 571 /* 572 * allocate virtual memory for this request 573 */ 574 if (virtual_space_start == virtual_space_end || 575 (virtual_space_end - virtual_space_start) < size) 576 panic("uvm_pageboot_alloc: out of virtual space"); 577 578 addr = virtual_space_start; 579 580 #ifdef PMAP_GROWKERNEL 581 /* 582 * If the kernel pmap can't map the requested space, 583 * then allocate more resources for it. 584 */ 585 if (uvm_maxkaddr < (addr + size)) { 586 uvm_maxkaddr = pmap_growkernel(addr + size); 587 if (uvm_maxkaddr < (addr + size)) 588 panic("uvm_pageboot_alloc: pmap_growkernel() failed"); 589 } 590 #endif 591 592 virtual_space_start += size; 593 594 /* 595 * allocate and mapin physical pages to back new virtual pages 596 */ 597 598 for (vaddr = round_page(addr) ; vaddr < addr + size ; 599 vaddr += PAGE_SIZE) { 600 601 if (!uvm_page_physget(&paddr)) 602 panic("uvm_pageboot_alloc: out of memory"); 603 604 /* 605 * Note this memory is no longer managed, so using 606 * pmap_kenter is safe. 607 */ 608 pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0); 609 } 610 pmap_update(pmap_kernel()); 611 return(addr); 612 #endif /* PMAP_STEAL_MEMORY */ 613 } 614 615 #if !defined(PMAP_STEAL_MEMORY) 616 /* 617 * uvm_page_physget: "steal" one page from the vm_physmem structure. 618 * 619 * => attempt to allocate it off the end of a segment in which the "avail" 620 * values match the start/end values. if we can't do that, then we 621 * will advance both values (making them equal, and removing some 622 * vm_page structures from the non-avail area). 623 * => return false if out of memory. 
624 */ 625 626 /* subroutine: try to allocate from memory chunks on the specified freelist */ 627 static bool uvm_page_physget_freelist(paddr_t *, int); 628 629 static bool 630 uvm_page_physget_freelist(paddr_t *paddrp, int freelist) 631 { 632 uvm_physseg_t lcv; 633 634 /* pass 1: try allocating from a matching end */ 635 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) 636 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv)) 637 #else 638 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv)) 639 #endif 640 { 641 if (uvm.page_init_done == true) 642 panic("uvm_page_physget: called _after_ bootstrap"); 643 644 /* Try to match at front or back on unused segment */ 645 if (uvm_page_physunload(lcv, freelist, paddrp)) 646 return true; 647 } 648 649 /* pass2: forget about matching ends, just allocate something */ 650 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) 651 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv)) 652 #else 653 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv)) 654 #endif 655 { 656 /* Try the front regardless. */ 657 if (uvm_page_physunload_force(lcv, freelist, paddrp)) 658 return true; 659 } 660 return false; 661 } 662 663 bool 664 uvm_page_physget(paddr_t *paddrp) 665 { 666 int i; 667 668 /* try in the order of freelist preference */ 669 for (i = 0; i < VM_NFREELIST; i++) 670 if (uvm_page_physget_freelist(paddrp, i) == true) 671 return (true); 672 return (false); 673 } 674 #endif /* PMAP_STEAL_MEMORY */ 675 676 /* 677 * PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages 678 * back from an I/O mapping (ugh!). used in some MD code as well. 679 */ 680 struct vm_page * 681 uvm_phys_to_vm_page(paddr_t pa) 682 { 683 paddr_t pf = atop(pa); 684 paddr_t off; 685 uvm_physseg_t upm; 686 687 upm = uvm_physseg_find(pf, &off); 688 if (upm != UVM_PHYSSEG_TYPE_INVALID) 689 return uvm_physseg_get_pg(upm, off); 690 return(NULL); 691 } 692 693 paddr_t 694 uvm_vm_page_to_phys(const struct vm_page *pg) 695 { 696 697 return pg->phys_addr & ~(PAGE_SIZE - 1); 698 } 699 700 /* 701 * uvm_page_numa_load: load NUMA range description. 702 */ 703 void 704 uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id) 705 { 706 struct uvm_page_numa_region *d; 707 708 KASSERT(numa_id < PGFL_MAX_BUCKETS); 709 710 d = kmem_alloc(sizeof(*d), KM_SLEEP); 711 d->start = start; 712 d->size = size; 713 d->numa_id = numa_id; 714 d->next = uvm_page_numa_region; 715 uvm_page_numa_region = d; 716 } 717 718 /* 719 * uvm_page_numa_lookup: lookup NUMA node for the given page. 720 */ 721 static u_int 722 uvm_page_numa_lookup(struct vm_page *pg) 723 { 724 struct uvm_page_numa_region *d; 725 static bool warned; 726 paddr_t pa; 727 728 KASSERT(uvm_page_numa_region != NULL); 729 730 pa = VM_PAGE_TO_PHYS(pg); 731 for (d = uvm_page_numa_region; d != NULL; d = d->next) { 732 if (pa >= d->start && pa < d->start + d->size) { 733 return d->numa_id; 734 } 735 } 736 737 if (!warned) { 738 printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#" 739 PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg)); 740 warned = true; 741 } 742 743 return 0; 744 } 745 746 /* 747 * uvm_page_redim: adjust freelist dimensions if they have changed. 
748 */ 749 750 static void 751 uvm_page_redim(int newncolors, int newnbuckets) 752 { 753 struct pgfreelist npgfl; 754 struct pgflbucket *opgb, *npgb; 755 struct pgflist *ohead, *nhead; 756 struct vm_page *pg; 757 size_t bucketsize, bucketmemsize, oldbucketmemsize; 758 int fl, ob, oc, nb, nc, obuckets, ocolors; 759 char *bucketarray, *oldbucketmem, *bucketmem; 760 761 KASSERT(((newncolors - 1) & newncolors) == 0); 762 763 /* Anything to do? */ 764 if (newncolors <= uvmexp.ncolors && 765 newnbuckets == uvm.bucketcount) { 766 return; 767 } 768 if (uvm.page_init_done == false) { 769 uvmexp.ncolors = newncolors; 770 return; 771 } 772 773 bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]); 774 bucketsize = roundup2(bucketsize, coherency_unit); 775 bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST + 776 coherency_unit - 1; 777 bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP); 778 bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit); 779 780 ocolors = uvmexp.ncolors; 781 obuckets = uvm.bucketcount; 782 783 /* Freelist cache musn't be enabled. */ 784 uvm_pgflcache_pause(); 785 786 /* Make sure we should still do this. */ 787 uvm_pgfl_lock(); 788 if (newncolors <= uvmexp.ncolors && 789 newnbuckets == uvm.bucketcount) { 790 uvm_pgfl_unlock(); 791 uvm_pgflcache_resume(); 792 kmem_free(bucketmem, bucketmemsize); 793 return; 794 } 795 796 uvmexp.ncolors = newncolors; 797 uvmexp.colormask = uvmexp.ncolors - 1; 798 uvm.bucketcount = newnbuckets; 799 800 for (fl = 0; fl < VM_NFREELIST; fl++) { 801 /* Init new buckets in new freelist. */ 802 memset(&npgfl, 0, sizeof(npgfl)); 803 for (nb = 0; nb < newnbuckets; nb++) { 804 npgb = (struct pgflbucket *)bucketarray; 805 uvm_page_init_bucket(&npgfl, npgb, nb); 806 bucketarray += bucketsize; 807 } 808 /* Now transfer pages from the old freelist. */ 809 for (nb = ob = 0; ob < obuckets; ob++) { 810 opgb = uvm.page_free[fl].pgfl_buckets[ob]; 811 for (oc = 0; oc < ocolors; oc++) { 812 ohead = &opgb->pgb_colors[oc]; 813 while ((pg = LIST_FIRST(ohead)) != NULL) { 814 LIST_REMOVE(pg, pageq.list); 815 /* 816 * Here we decide on the NEW color & 817 * bucket for the page. For NUMA 818 * we'll use the info that the 819 * hardware gave us. For non-NUMA 820 * assign take physical page frame 821 * number and cache color into 822 * account. We do this to try and 823 * avoid defeating any memory 824 * interleaving in the hardware. 825 */ 826 KASSERT( 827 uvm_page_get_bucket(pg) == ob); 828 KASSERT(fl == 829 uvm_page_get_freelist(pg)); 830 if (uvm_page_numa_region != NULL) { 831 nb = uvm_page_numa_lookup(pg); 832 } else { 833 nb = atop(VM_PAGE_TO_PHYS(pg)) 834 / uvmexp.ncolors / 8 835 % newnbuckets; 836 } 837 uvm_page_set_bucket(pg, nb); 838 npgb = npgfl.pgfl_buckets[nb]; 839 npgb->pgb_nfree++; 840 nc = VM_PGCOLOR(pg); 841 nhead = &npgb->pgb_colors[nc]; 842 LIST_INSERT_HEAD(nhead, pg, pageq.list); 843 } 844 } 845 } 846 /* Install the new freelist. */ 847 memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl)); 848 } 849 850 /* Unlock and free the old memory. */ 851 oldbucketmemsize = recolored_pages_memsize; 852 oldbucketmem = recolored_pages_mem; 853 recolored_pages_memsize = bucketmemsize; 854 recolored_pages_mem = bucketmem; 855 856 uvm_pgfl_unlock(); 857 uvm_pgflcache_resume(); 858 859 if (oldbucketmemsize) { 860 kmem_free(oldbucketmem, oldbucketmemsize); 861 } 862 863 /* 864 * this calls uvm_km_alloc() which may want to hold 865 * uvm_freelist_lock. 
866 */ 867 uvm_pager_realloc_emerg(); 868 } 869 870 /* 871 * uvm_page_recolor: Recolor the pages if the new color count is 872 * larger than the old one. 873 */ 874 875 void 876 uvm_page_recolor(int newncolors) 877 { 878 879 uvm_page_redim(newncolors, uvm.bucketcount); 880 } 881 882 /* 883 * uvm_page_rebucket: Determine a bucket structure and redim the free 884 * lists to match. 885 */ 886 887 void 888 uvm_page_rebucket(void) 889 { 890 u_int min_numa, max_numa, npackage, shift; 891 struct cpu_info *ci, *ci2, *ci3; 892 CPU_INFO_ITERATOR cii; 893 894 /* 895 * If we have more than one NUMA node, and the maximum NUMA node ID 896 * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution 897 * for free pages. 898 */ 899 min_numa = (u_int)-1; 900 max_numa = 0; 901 for (CPU_INFO_FOREACH(cii, ci)) { 902 if (ci->ci_numa_id < min_numa) { 903 min_numa = ci->ci_numa_id; 904 } 905 if (ci->ci_numa_id > max_numa) { 906 max_numa = ci->ci_numa_id; 907 } 908 } 909 if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) { 910 aprint_debug("UVM: using NUMA allocation scheme\n"); 911 for (CPU_INFO_FOREACH(cii, ci)) { 912 ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id; 913 } 914 uvm_page_redim(uvmexp.ncolors, max_numa + 1); 915 return; 916 } 917 918 /* 919 * Otherwise we'll go with a scheme to maximise L2/L3 cache locality 920 * and minimise lock contention. Count the total number of CPU 921 * packages, and then try to distribute the buckets among CPU 922 * packages evenly. 923 */ 924 npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST]; 925 926 /* 927 * Figure out how to arrange the packages & buckets, and the total 928 * number of buckets we need. XXX 2 may not be the best factor. 929 */ 930 for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) { 931 npackage >>= 1; 932 } 933 uvm_page_redim(uvmexp.ncolors, npackage); 934 935 /* 936 * Now tell each CPU which bucket to use. In the outer loop, scroll 937 * through all CPU packages. 938 */ 939 npackage = 0; 940 ci = curcpu(); 941 ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST]; 942 do { 943 /* 944 * In the inner loop, scroll through all CPUs in the package 945 * and assign the same bucket ID. 946 */ 947 ci3 = ci2; 948 do { 949 ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift; 950 ci3 = ci3->ci_sibling[CPUREL_PACKAGE]; 951 } while (ci3 != ci2); 952 npackage++; 953 ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST]; 954 } while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]); 955 956 aprint_debug("UVM: using package allocation scheme, " 957 "%d package(s) per bucket\n", 1 << shift); 958 } 959 960 /* 961 * uvm_cpu_attach: initialize per-CPU data structures. 962 */ 963 964 void 965 uvm_cpu_attach(struct cpu_info *ci) 966 { 967 struct uvm_cpu *ucpu; 968 969 /* Already done in uvm_page_init(). */ 970 if (!CPU_IS_PRIMARY(ci)) { 971 /* Add more reserve pages for this CPU. */ 972 uvmexp.reserve_kernel += vm_page_reserve_kernel; 973 974 /* Allocate per-CPU data structures. */ 975 ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1, 976 KM_SLEEP); 977 ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu, 978 coherency_unit); 979 ci->ci_data.cpu_uvm = ucpu; 980 } else { 981 ucpu = ci->ci_data.cpu_uvm; 982 } 983 984 uvmpdpol_init_cpu(ucpu); 985 986 /* 987 * Attach RNG source for this CPU's VM events 988 */ 989 rnd_attach_source(&ucpu->rs, ci->ci_data.cpu_name, RND_TYPE_VM, 990 RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE| 991 RND_FLAG_ESTIMATE_VALUE); 992 } 993 994 /* 995 * uvm_availmem: fetch the total amount of free memory in pages. 
 * have a detrimental effect on performance due to false sharing; don't call
 * unless needed.
 *
 * some users can request the amount of free memory so often that it begins
 * to impact upon performance.  if calling frequently and an inexact value
 * is okay, call with cached = true.
 */

int
uvm_availmem(bool cached)
{
	int64_t fp;

	cpu_count_sync(cached);
	if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) {
		/*
		 * XXXAD could briefly go negative because it's impossible
		 * to get a clean snapshot.  address this for other counters
		 * used as running totals before NetBSD 10 although less
		 * important for those.
		 */
		fp = 0;
	}
	return (int)fp;
}

/*
 * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
 * specific freelist and specific bucket only.
 *
 * => must be at IPL_VM or higher to protect per-CPU data structures.
 */

static struct vm_page *
uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
{
	int c, trycolor, colormask;
	struct pgflbucket *pgb;
	struct vm_page *pg;
	kmutex_t *lock;
	bool fill;

	/*
	 * Skip the bucket if empty, no lock needed.  There could be many
	 * empty freelists/buckets.
	 */
	pgb = uvm.page_free[f].pgfl_buckets[b];
	if (pgb->pgb_nfree == 0) {
		return NULL;
	}

	/* Skip bucket if low on memory. */
	lock = &uvm_freelist_locks[b].lock;
	mutex_spin_enter(lock);
	if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
		if ((flags & UVM_PGA_USERESERVE) == 0 ||
		    (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
		     curlwp != uvm.pagedaemon_lwp)) {
			mutex_spin_exit(lock);
			return NULL;
		}
		fill = false;
	} else {
		fill = true;
	}

	/* Try all page colors as needed. */
	c = trycolor = *trycolorp;
	colormask = uvmexp.colormask;
	do {
		pg = LIST_FIRST(&pgb->pgb_colors[c]);
		if (__predict_true(pg != NULL)) {
			/*
			 * Got a free page!  PG_FREE must be cleared under
			 * lock because of uvm_pglistalloc().
			 */
			LIST_REMOVE(pg, pageq.list);
			KASSERT(pg->flags == PG_FREE);
			pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
			pgb->pgb_nfree--;

			/*
			 * While we have the bucket locked and our data
			 * structures fresh in L1 cache, we have an ideal
			 * opportunity to grab some pages for the freelist
			 * cache without causing extra contention.  Only do
			 * so if we found pages in this CPU's preferred
			 * bucket.
			 */
			if (__predict_true(b == ucpu->pgflbucket && fill)) {
				uvm_pgflcache_fill(ucpu, f, b, c);
			}
			mutex_spin_exit(lock);
			KASSERT(uvm_page_get_bucket(pg) == b);
			CPU_COUNT(c == trycolor ?
			    CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
			CPU_COUNT(CPU_COUNT_CPUMISS, 1);
			*trycolorp = c;
			return pg;
		}
		c = (c + 1) & colormask;
	} while (c != trycolor);
	mutex_spin_exit(lock);

	return NULL;
}

/*
 * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
 * any color from any bucket, in a specific freelist.
 *
 * => must be at IPL_VM or higher to protect per-CPU data structures.
1108 */ 1109 1110 static struct vm_page * 1111 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags) 1112 { 1113 int b, trybucket, bucketcount; 1114 struct vm_page *pg; 1115 1116 /* Try for the exact thing in the per-CPU cache. */ 1117 if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) { 1118 CPU_COUNT(CPU_COUNT_CPUHIT, 1); 1119 CPU_COUNT(CPU_COUNT_COLORHIT, 1); 1120 return pg; 1121 } 1122 1123 /* Walk through all buckets, trying our preferred bucket first. */ 1124 trybucket = ucpu->pgflbucket; 1125 b = trybucket; 1126 bucketcount = uvm.bucketcount; 1127 do { 1128 pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags); 1129 if (pg != NULL) { 1130 return pg; 1131 } 1132 b = (b + 1 == bucketcount ? 0 : b + 1); 1133 } while (b != trybucket); 1134 1135 return NULL; 1136 } 1137 1138 /* 1139 * uvm_pagealloc_strat: allocate vm_page from a particular free list. 1140 * 1141 * => return null if no pages free 1142 * => wake up pagedaemon if number of free pages drops below low water mark 1143 * => if obj != NULL, obj must be locked (to put in obj's tree) 1144 * => if anon != NULL, anon must be locked (to put in anon) 1145 * => only one of obj or anon can be non-null 1146 * => caller must activate/deactivate page if it is not wired. 1147 * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL. 1148 * => policy decision: it is more important to pull a page off of the 1149 * appropriate priority free list than it is to get a page from the 1150 * correct bucket or color bin. This is because we live with the 1151 * consequences of a bad free list decision for the entire 1152 * lifetime of the page, e.g. if the page comes from memory that 1153 * is slower to access. 1154 */ 1155 1156 struct vm_page * 1157 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon, 1158 int flags, int strat, int free_list) 1159 { 1160 int color, lcv, error, s; 1161 struct uvm_cpu *ucpu; 1162 struct vm_page *pg; 1163 lwp_t *l; 1164 1165 KASSERT(obj == NULL || anon == NULL); 1166 KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0); 1167 KASSERT(off == trunc_page(off)); 1168 KASSERT(obj == NULL || rw_write_held(obj->vmobjlock)); 1169 KASSERT(anon == NULL || anon->an_lock == NULL || 1170 rw_write_held(anon->an_lock)); 1171 1172 /* 1173 * This implements a global round-robin page coloring 1174 * algorithm. 1175 */ 1176 1177 s = splvm(); 1178 ucpu = curcpu()->ci_data.cpu_uvm; 1179 if (flags & UVM_FLAG_COLORMATCH) { 1180 color = atop(off) & uvmexp.colormask; 1181 } else { 1182 color = ucpu->pgflcolor; 1183 } 1184 1185 /* 1186 * fail if any of these conditions is true: 1187 * [1] there really are no free pages, or 1188 * [2] only kernel "reserved" pages remain and 1189 * reserved pages have not been requested. 1190 * [3] only pagedaemon "reserved" pages remain and 1191 * the requestor isn't the pagedaemon. 1192 * we make kernel reserve pages available if called by a 1193 * kernel thread. 1194 */ 1195 l = curlwp; 1196 if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) { 1197 flags |= UVM_PGA_USERESERVE; 1198 } 1199 1200 again: 1201 switch (strat) { 1202 case UVM_PGA_STRAT_NORMAL: 1203 /* Check freelists: descending priority (ascending id) order. */ 1204 for (lcv = 0; lcv < VM_NFREELIST; lcv++) { 1205 pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags); 1206 if (pg != NULL) { 1207 goto gotit; 1208 } 1209 } 1210 1211 /* No pages free! Have pagedaemon free some memory. 
		splx(s);
		uvm_kick_pdaemon();
		return NULL;

	case UVM_PGA_STRAT_ONLY:
	case UVM_PGA_STRAT_FALLBACK:
		/* Attempt to allocate from the specified free list. */
		KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
		pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
		if (pg != NULL) {
			goto gotit;
		}

		/* Fall back, if possible. */
		if (strat == UVM_PGA_STRAT_FALLBACK) {
			strat = UVM_PGA_STRAT_NORMAL;
			goto again;
		}

		/* No pages free!  Have pagedaemon free some memory. */
		splx(s);
		uvm_kick_pdaemon();
		return NULL;

	case UVM_PGA_STRAT_NUMA:
		/*
		 * NUMA strategy (experimental): allocating from the correct
		 * bucket is more important than observing freelist
		 * priority.  Look only to the current NUMA node; if that
		 * fails, we need to look to other NUMA nodes, so retry with
		 * the normal strategy.
		 */
		for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
			pg = uvm_pgflcache_alloc(ucpu, lcv, color);
			if (pg != NULL) {
				CPU_COUNT(CPU_COUNT_CPUHIT, 1);
				CPU_COUNT(CPU_COUNT_COLORHIT, 1);
				goto gotit;
			}
			pg = uvm_pagealloc_pgb(ucpu, lcv,
			    ucpu->pgflbucket, &color, flags);
			if (pg != NULL) {
				goto gotit;
			}
		}
		strat = UVM_PGA_STRAT_NORMAL;
		goto again;

	default:
		panic("uvm_pagealloc_strat: bad strat %d", strat);
		/* NOTREACHED */
	}

 gotit:
	/*
	 * We now know which color we actually allocated from; set
	 * the next color accordingly.
	 */

	ucpu->pgflcolor = (color + 1) & uvmexp.colormask;

	/*
	 * while still at IPL_VM, update allocation statistics.
	 */

	CPU_COUNT(CPU_COUNT_FREEPAGES, -1);
	if (anon) {
		CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
	}
	splx(s);
	KASSERT(pg->flags == (PG_BUSY|PG_CLEAN|PG_FAKE));

	/*
	 * assign the page to the object.  as the page was free, we know
	 * that pg->uobject and pg->uanon are NULL.  we only need to take
	 * the page's interlock if we are changing the values.
	 */
	if (anon != NULL || obj != NULL) {
		mutex_enter(&pg->interlock);
	}
	pg->offset = off;
	pg->uobject = obj;
	pg->uanon = anon;
	KASSERT(uvm_page_owner_locked_p(pg, true));
	if (anon) {
		anon->an_page = pg;
		pg->flags |= PG_ANON;
		mutex_exit(&pg->interlock);
	} else if (obj) {
		/*
		 * set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
		 */
		if (UVM_OBJ_IS_VNODE(obj)) {
			pg->flags |= PG_FILE;
		} else if (UVM_OBJ_IS_AOBJ(obj)) {
			pg->flags |= PG_AOBJ;
		}
		uvm_pageinsert_object(obj, pg);
		mutex_exit(&pg->interlock);
		error = uvm_pageinsert_tree(obj, pg);
		if (error != 0) {
			mutex_enter(&pg->interlock);
			uvm_pageremove_object(obj, pg);
			mutex_exit(&pg->interlock);
			uvm_pagefree(pg);
			return NULL;
		}
	}

#if defined(UVM_PAGE_TRKOWN)
	pg->owner_tag = NULL;
#endif
	UVM_PAGE_OWN(pg, "new alloc");

	if (flags & UVM_PGA_ZERO) {
		/* A zero'd page is not clean. */
		if (obj != NULL || anon != NULL) {
			uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
		}
		pmap_zero_page(VM_PAGE_TO_PHYS(pg));
	}

	return(pg);
}
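
/*
 * Most callers go through the uvm_pagealloc() wrapper, which uses
 * UVM_PGA_STRAT_NORMAL.  A typical object-backed allocation retries after
 * waking the pagedaemon, along these lines (sketch only; "epg" is just an
 * illustrative wmesg):
 *
 *	while ((pg = uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO)) == NULL) {
 *		rw_exit(uobj->vmobjlock);
 *		uvm_wait("epg");
 *		rw_enter(uobj->vmobjlock, RW_WRITER);
 *	}
 */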

/*
 * uvm_pagereplace: replace a page with another
 *
 * => object must be locked
 * => page interlocks must be held
 */

void
uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
{
	struct uvm_object *uobj = oldpg->uobject;
	struct vm_page *pg __diagused;
	uint64_t idx;

	KASSERT((oldpg->flags & PG_TABLED) != 0);
	KASSERT(uobj != NULL);
	KASSERT((newpg->flags & PG_TABLED) == 0);
	KASSERT(newpg->uobject == NULL);
	KASSERT(rw_write_held(uobj->vmobjlock));
	KASSERT(mutex_owned(&oldpg->interlock));
	KASSERT(mutex_owned(&newpg->interlock));

	newpg->uobject = uobj;
	newpg->offset = oldpg->offset;
	idx = newpg->offset >> PAGE_SHIFT;
	pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
	KASSERT(pg == oldpg);
	if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
		if ((newpg->flags & PG_CLEAN) != 0) {
			radix_tree_clear_tag(&uobj->uo_pages, idx,
			    UVM_PAGE_DIRTY_TAG);
		} else {
			radix_tree_set_tag(&uobj->uo_pages, idx,
			    UVM_PAGE_DIRTY_TAG);
		}
	}
	/*
	 * oldpg's PG_STAT is stable.  newpg is not reachable by others yet.
	 */
	newpg->flags |=
	    (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
	uvm_pageinsert_object(uobj, newpg);
	uvm_pageremove_object(uobj, oldpg);
}

/*
 * uvm_pagerealloc: reallocate a page from one object to another
 *
 * => both objects must be locked
 */

int
uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
{
	int error = 0;

	/*
	 * remove it from the old object
	 */

	if (pg->uobject) {
		uvm_pageremove_tree(pg->uobject, pg);
		uvm_pageremove_object(pg->uobject, pg);
	}

	/*
	 * put it in the new object
	 */

	if (newobj) {
		mutex_enter(&pg->interlock);
		pg->uobject = newobj;
		pg->offset = newoff;
		if (UVM_OBJ_IS_VNODE(newobj)) {
			pg->flags |= PG_FILE;
		} else if (UVM_OBJ_IS_AOBJ(newobj)) {
			pg->flags |= PG_AOBJ;
		}
		uvm_pageinsert_object(newobj, pg);
		mutex_exit(&pg->interlock);
		error = uvm_pageinsert_tree(newobj, pg);
		if (error != 0) {
			mutex_enter(&pg->interlock);
			uvm_pageremove_object(newobj, pg);
			mutex_exit(&pg->interlock);
		}
	}

	return error;
}

#ifdef DEBUG
/*
 * check if page is zero-filled
 */
void
uvm_pagezerocheck(struct vm_page *pg)
{
	int *p, *ep;

	KASSERT(uvm_zerocheckkva != 0);

	/*
	 * XXX assuming pmap_kenter_pa and pmap_kremove never call
	 * uvm page allocator.
	 *
	 * it might be better to have "CPU-local temporary map" pmap interface.
	 */
	mutex_spin_enter(&uvm_zerochecklock);
	pmap_kenter_pa(uvm_zerocheckkva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, 0);
	p = (int *)uvm_zerocheckkva;
	ep = (int *)((char *)p + PAGE_SIZE);
	pmap_update(pmap_kernel());
	while (p < ep) {
		if (*p != 0)
			panic("zero page isn't zero-filled");
		p++;
	}
	pmap_kremove(uvm_zerocheckkva, PAGE_SIZE);
	mutex_spin_exit(&uvm_zerochecklock);
	/*
	 * pmap_update() is not necessary here because no one except us
	 * uses this VA.
1460 */ 1461 } 1462 #endif /* DEBUG */ 1463 1464 /* 1465 * uvm_pagefree: free page 1466 * 1467 * => erase page's identity (i.e. remove from object) 1468 * => put page on free list 1469 * => caller must lock owning object (either anon or uvm_object) 1470 * => assumes all valid mappings of pg are gone 1471 */ 1472 1473 void 1474 uvm_pagefree(struct vm_page *pg) 1475 { 1476 struct pgfreelist *pgfl; 1477 struct pgflbucket *pgb; 1478 struct uvm_cpu *ucpu; 1479 kmutex_t *lock; 1480 int bucket, s; 1481 bool locked; 1482 1483 #ifdef DEBUG 1484 if (pg->uobject == (void *)0xdeadbeef && 1485 pg->uanon == (void *)0xdeadbeef) { 1486 panic("uvm_pagefree: freeing free page %p", pg); 1487 } 1488 #endif /* DEBUG */ 1489 1490 KASSERT((pg->flags & PG_PAGEOUT) == 0); 1491 KASSERT(!(pg->flags & PG_FREE)); 1492 KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock)); 1493 KASSERT(pg->uobject != NULL || pg->uanon == NULL || 1494 rw_write_held(pg->uanon->an_lock)); 1495 1496 /* 1497 * remove the page from the object's tree before acquiring any page 1498 * interlocks: this can acquire locks to free radixtree nodes. 1499 */ 1500 if (pg->uobject != NULL) { 1501 uvm_pageremove_tree(pg->uobject, pg); 1502 } 1503 1504 /* 1505 * if the page is loaned, resolve the loan instead of freeing. 1506 */ 1507 1508 if (pg->loan_count) { 1509 KASSERT(pg->wire_count == 0); 1510 1511 /* 1512 * if the page is owned by an anon then we just want to 1513 * drop anon ownership. the kernel will free the page when 1514 * it is done with it. if the page is owned by an object, 1515 * remove it from the object and mark it dirty for the benefit 1516 * of possible anon owners. 1517 * 1518 * regardless of previous ownership, wakeup any waiters, 1519 * unbusy the page, and we're done. 1520 */ 1521 1522 uvm_pagelock(pg); 1523 locked = true; 1524 if (pg->uobject != NULL) { 1525 uvm_pageremove_object(pg->uobject, pg); 1526 pg->flags &= ~(PG_FILE|PG_AOBJ); 1527 } else if (pg->uanon != NULL) { 1528 if ((pg->flags & PG_ANON) == 0) { 1529 pg->loan_count--; 1530 } else { 1531 const unsigned status = uvm_pagegetdirty(pg); 1532 pg->flags &= ~PG_ANON; 1533 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1); 1534 } 1535 pg->uanon->an_page = NULL; 1536 pg->uanon = NULL; 1537 } 1538 if (pg->pqflags & PQ_WANTED) { 1539 wakeup(pg); 1540 } 1541 pg->pqflags &= ~PQ_WANTED; 1542 pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1); 1543 #ifdef UVM_PAGE_TRKOWN 1544 pg->owner_tag = NULL; 1545 #endif 1546 KASSERT((pg->flags & PG_STAT) == 0); 1547 if (pg->loan_count) { 1548 KASSERT(pg->uobject == NULL); 1549 if (pg->uanon == NULL) { 1550 uvm_pagedequeue(pg); 1551 } 1552 uvm_pageunlock(pg); 1553 return; 1554 } 1555 } else if (pg->uobject != NULL || pg->uanon != NULL || 1556 pg->wire_count != 0) { 1557 uvm_pagelock(pg); 1558 locked = true; 1559 } else { 1560 locked = false; 1561 } 1562 1563 /* 1564 * remove page from its object or anon. 1565 */ 1566 if (pg->uobject != NULL) { 1567 uvm_pageremove_object(pg->uobject, pg); 1568 } else if (pg->uanon != NULL) { 1569 const unsigned int status = uvm_pagegetdirty(pg); 1570 pg->uanon->an_page = NULL; 1571 pg->uanon = NULL; 1572 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1); 1573 } 1574 1575 /* 1576 * if the page was wired, unwire it now. 1577 */ 1578 1579 if (pg->wire_count) { 1580 pg->wire_count = 0; 1581 atomic_dec_uint(&uvmexp.wired); 1582 } 1583 if (locked) { 1584 /* 1585 * wake anyone waiting on the page. 
1586 */ 1587 if ((pg->pqflags & PQ_WANTED) != 0) { 1588 pg->pqflags &= ~PQ_WANTED; 1589 wakeup(pg); 1590 } 1591 1592 /* 1593 * now remove the page from the queues. 1594 */ 1595 uvm_pagedequeue(pg); 1596 uvm_pageunlock(pg); 1597 } else { 1598 KASSERT(!uvmpdpol_pageisqueued_p(pg)); 1599 } 1600 1601 /* 1602 * and put on free queue 1603 */ 1604 1605 #ifdef DEBUG 1606 pg->uobject = (void *)0xdeadbeef; 1607 pg->uanon = (void *)0xdeadbeef; 1608 #endif /* DEBUG */ 1609 1610 /* Try to send the page to the per-CPU cache. */ 1611 s = splvm(); 1612 CPU_COUNT(CPU_COUNT_FREEPAGES, 1); 1613 ucpu = curcpu()->ci_data.cpu_uvm; 1614 bucket = uvm_page_get_bucket(pg); 1615 if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) { 1616 splx(s); 1617 return; 1618 } 1619 1620 /* Didn't work. Never mind, send it to a global bucket. */ 1621 pgfl = &uvm.page_free[uvm_page_get_freelist(pg)]; 1622 pgb = pgfl->pgfl_buckets[bucket]; 1623 lock = &uvm_freelist_locks[bucket].lock; 1624 1625 mutex_spin_enter(lock); 1626 /* PG_FREE must be set under lock because of uvm_pglistalloc(). */ 1627 pg->flags = PG_FREE; 1628 LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list); 1629 pgb->pgb_nfree++; 1630 mutex_spin_exit(lock); 1631 splx(s); 1632 } 1633 1634 /* 1635 * uvm_page_unbusy: unbusy an array of pages. 1636 * 1637 * => pages must either all belong to the same object, or all belong to anons. 1638 * => if pages are object-owned, object must be locked. 1639 * => if pages are anon-owned, anons must be locked. 1640 * => caller must make sure that anon-owned pages are not PG_RELEASED. 1641 */ 1642 1643 void 1644 uvm_page_unbusy(struct vm_page **pgs, int npgs) 1645 { 1646 struct vm_page *pg; 1647 int i; 1648 UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); 1649 1650 for (i = 0; i < npgs; i++) { 1651 pg = pgs[i]; 1652 if (pg == NULL || pg == PGO_DONTCARE) { 1653 continue; 1654 } 1655 1656 KASSERT(uvm_page_owner_locked_p(pg, true)); 1657 KASSERT(pg->flags & PG_BUSY); 1658 KASSERT((pg->flags & PG_PAGEOUT) == 0); 1659 if (pg->flags & PG_RELEASED) { 1660 UVMHIST_LOG(ubchist, "releasing pg %#jx", 1661 (uintptr_t)pg, 0, 0, 0); 1662 KASSERT(pg->uobject != NULL || 1663 (pg->uanon != NULL && pg->uanon->an_ref > 0)); 1664 pg->flags &= ~PG_RELEASED; 1665 uvm_pagefree(pg); 1666 } else { 1667 UVMHIST_LOG(ubchist, "unbusying pg %#jx", 1668 (uintptr_t)pg, 0, 0, 0); 1669 KASSERT((pg->flags & PG_FAKE) == 0); 1670 pg->flags &= ~PG_BUSY; 1671 uvm_pagelock(pg); 1672 uvm_pagewakeup(pg); 1673 uvm_pageunlock(pg); 1674 UVM_PAGE_OWN(pg, NULL); 1675 } 1676 } 1677 } 1678 1679 /* 1680 * uvm_pagewait: wait for a busy page 1681 * 1682 * => page must be known PG_BUSY 1683 * => object must be read or write locked 1684 * => object will be unlocked on return 1685 */ 1686 1687 void 1688 uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg) 1689 { 1690 1691 KASSERT(rw_lock_held(lock)); 1692 KASSERT((pg->flags & PG_BUSY) != 0); 1693 KASSERT(uvm_page_owner_locked_p(pg, false)); 1694 1695 mutex_enter(&pg->interlock); 1696 pg->pqflags |= PQ_WANTED; 1697 rw_exit(lock); 1698 UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0); 1699 } 1700 1701 /* 1702 * uvm_pagewakeup: wake anyone waiting on a page 1703 * 1704 * => page interlock must be held 1705 */ 1706 1707 void 1708 uvm_pagewakeup(struct vm_page *pg) 1709 { 1710 UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); 1711 1712 KASSERT(mutex_owned(&pg->interlock)); 1713 1714 UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0); 1715 1716 if ((pg->pqflags & PQ_WANTED) != 0) { 

/*
 * uvm_pagewakeup: wake anyone waiting on a page
 *
 * => page interlock must be held
 */

void
uvm_pagewakeup(struct vm_page *pg)
{
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);

	KASSERT(mutex_owned(&pg->interlock));

	UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);

	if ((pg->pqflags & PQ_WANTED) != 0) {
		wakeup(pg);
		pg->pqflags &= ~PQ_WANTED;
	}
}

/*
 * uvm_pagewanted_p: return true if someone is waiting on the page
 *
 * => object must be write locked (lock out all concurrent access)
 */

bool
uvm_pagewanted_p(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, true));

	return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0;
}

#if defined(UVM_PAGE_TRKOWN)
/*
 * uvm_page_own: set or release page ownership
 *
 * => this is a debugging function that keeps track of who sets PG_BUSY
 *	and where they do it.  it can be used to track down problems
 *	such as a process setting "PG_BUSY" and never releasing it.
 * => page's object [if any] must be locked
 * => if "tag" is NULL then we are releasing page ownership
 */
void
uvm_page_own(struct vm_page *pg, const char *tag)
{

	KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
	KASSERT(uvm_page_owner_locked_p(pg, true));

	/* gain ownership? */
	if (tag) {
		KASSERT((pg->flags & PG_BUSY) != 0);
		if (pg->owner_tag) {
			printf("uvm_page_own: page %p already owned "
			    "by proc %d.%d [%s]\n", pg,
			    pg->owner, pg->lowner, pg->owner_tag);
			panic("uvm_page_own");
		}
		pg->owner = curproc->p_pid;
		pg->lowner = curlwp->l_lid;
		pg->owner_tag = tag;
		return;
	}

	/* drop ownership */
	KASSERT((pg->flags & PG_BUSY) == 0);
	if (pg->owner_tag == NULL) {
		printf("uvm_page_own: dropping ownership of a non-owned "
		    "page (%p)\n", pg);
		panic("uvm_page_own");
	}
	pg->owner_tag = NULL;
}
#endif

/*
 * uvm_pagelookup: look up a page
 *
 * => caller should lock object to keep someone from pulling the page
 *	out from under it
 */

struct vm_page *
uvm_pagelookup(struct uvm_object *obj, voff_t off)
{
	struct vm_page *pg;

	/* No - used from DDB.  KASSERT(rw_lock_held(obj->vmobjlock)); */

	pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);

	KASSERT(pg == NULL || obj->uo_npages != 0);
	KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
	    (pg->flags & PG_BUSY) != 0);
	return pg;
}

/*
 * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
 *
 * => caller must lock objects
 * => caller must hold pg->interlock
 */

void
uvm_pagewire(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, true));
	KASSERT(mutex_owned(&pg->interlock));
#if defined(READAHEAD_STATS)
	if ((pg->flags & PG_READAHEAD) != 0) {
		uvm_ra_hit.ev_count++;
		pg->flags &= ~PG_READAHEAD;
	}
#endif /* defined(READAHEAD_STATS) */
	if (pg->wire_count == 0) {
		uvm_pagedequeue(pg);
		atomic_inc_uint(&uvmexp.wired);
	}
	pg->wire_count++;
	KASSERT(pg->wire_count > 0);	/* detect wraparound */
}

/*
 * uvm_pageunwire: unwire the page.
 *
 * => activate if wire count goes to zero.
1833 * => caller must lock objects 1834 * => caller must hold pg->interlock 1835 */ 1836 1837 void 1838 uvm_pageunwire(struct vm_page *pg) 1839 { 1840 1841 KASSERT(uvm_page_owner_locked_p(pg, true)); 1842 KASSERT(pg->wire_count != 0); 1843 KASSERT(!uvmpdpol_pageisqueued_p(pg)); 1844 KASSERT(mutex_owned(&pg->interlock)); 1845 pg->wire_count--; 1846 if (pg->wire_count == 0) { 1847 uvm_pageactivate(pg); 1848 KASSERT(uvmexp.wired != 0); 1849 atomic_dec_uint(&uvmexp.wired); 1850 } 1851 } 1852 1853 /* 1854 * uvm_pagedeactivate: deactivate page 1855 * 1856 * => caller must lock objects 1857 * => caller must check to make sure page is not wired 1858 * => object that page belongs to must be locked (so we can adjust pg->flags) 1859 * => caller must clear the reference on the page before calling 1860 * => caller must hold pg->interlock 1861 */ 1862 1863 void 1864 uvm_pagedeactivate(struct vm_page *pg) 1865 { 1866 1867 KASSERT(uvm_page_owner_locked_p(pg, false)); 1868 KASSERT(mutex_owned(&pg->interlock)); 1869 if (pg->wire_count == 0) { 1870 KASSERT(uvmpdpol_pageisqueued_p(pg)); 1871 uvmpdpol_pagedeactivate(pg); 1872 } 1873 } 1874 1875 /* 1876 * uvm_pageactivate: activate page 1877 * 1878 * => caller must lock objects 1879 * => caller must hold pg->interlock 1880 */ 1881 1882 void 1883 uvm_pageactivate(struct vm_page *pg) 1884 { 1885 1886 KASSERT(uvm_page_owner_locked_p(pg, false)); 1887 KASSERT(mutex_owned(&pg->interlock)); 1888 #if defined(READAHEAD_STATS) 1889 if ((pg->flags & PG_READAHEAD) != 0) { 1890 uvm_ra_hit.ev_count++; 1891 pg->flags &= ~PG_READAHEAD; 1892 } 1893 #endif /* defined(READAHEAD_STATS) */ 1894 if (pg->wire_count == 0) { 1895 uvmpdpol_pageactivate(pg); 1896 } 1897 } 1898 1899 /* 1900 * uvm_pagedequeue: remove a page from any paging queue 1901 * 1902 * => caller must lock objects 1903 * => caller must hold pg->interlock 1904 */ 1905 void 1906 uvm_pagedequeue(struct vm_page *pg) 1907 { 1908 1909 KASSERT(uvm_page_owner_locked_p(pg, true)); 1910 KASSERT(mutex_owned(&pg->interlock)); 1911 if (uvmpdpol_pageisqueued_p(pg)) { 1912 uvmpdpol_pagedequeue(pg); 1913 } 1914 } 1915 1916 /* 1917 * uvm_pageenqueue: add a page to a paging queue without activating. 1918 * used where a page is not really demanded (yet). eg. read-ahead 1919 * 1920 * => caller must lock objects 1921 * => caller must hold pg->interlock 1922 */ 1923 void 1924 uvm_pageenqueue(struct vm_page *pg) 1925 { 1926 1927 KASSERT(uvm_page_owner_locked_p(pg, false)); 1928 KASSERT(mutex_owned(&pg->interlock)); 1929 if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) { 1930 uvmpdpol_pageenqueue(pg); 1931 } 1932 } 1933 1934 /* 1935 * uvm_pagelock: acquire page interlock 1936 */ 1937 void 1938 uvm_pagelock(struct vm_page *pg) 1939 { 1940 1941 mutex_enter(&pg->interlock); 1942 } 1943 1944 /* 1945 * uvm_pagelock2: acquire two page interlocks 1946 */ 1947 void 1948 uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2) 1949 { 1950 1951 if (pg1 < pg2) { 1952 mutex_enter(&pg1->interlock); 1953 mutex_enter(&pg2->interlock); 1954 } else { 1955 mutex_enter(&pg2->interlock); 1956 mutex_enter(&pg1->interlock); 1957 } 1958 } 1959 1960 /* 1961 * uvm_pageunlock: release page interlock, and if a page replacement intent 1962 * is set on the page, pass it to uvmpdpol to make real. 
1963 * 1964 * => caller must hold pg->interlock 1965 */ 1966 void 1967 uvm_pageunlock(struct vm_page *pg) 1968 { 1969 1970 if ((pg->pqflags & PQ_INTENT_SET) == 0 || 1971 (pg->pqflags & PQ_INTENT_QUEUED) != 0) { 1972 mutex_exit(&pg->interlock); 1973 return; 1974 } 1975 pg->pqflags |= PQ_INTENT_QUEUED; 1976 mutex_exit(&pg->interlock); 1977 uvmpdpol_pagerealize(pg); 1978 } 1979 1980 /* 1981 * uvm_pageunlock2: release two page interlocks, and for both pages if a 1982 * page replacement intent is set on the page, pass it to uvmpdpol to make 1983 * real. 1984 * 1985 * => caller must hold pg->interlock 1986 */ 1987 void 1988 uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2) 1989 { 1990 1991 if ((pg1->pqflags & PQ_INTENT_SET) == 0 || 1992 (pg1->pqflags & PQ_INTENT_QUEUED) != 0) { 1993 mutex_exit(&pg1->interlock); 1994 pg1 = NULL; 1995 } else { 1996 pg1->pqflags |= PQ_INTENT_QUEUED; 1997 mutex_exit(&pg1->interlock); 1998 } 1999 2000 if ((pg2->pqflags & PQ_INTENT_SET) == 0 || 2001 (pg2->pqflags & PQ_INTENT_QUEUED) != 0) { 2002 mutex_exit(&pg2->interlock); 2003 pg2 = NULL; 2004 } else { 2005 pg2->pqflags |= PQ_INTENT_QUEUED; 2006 mutex_exit(&pg2->interlock); 2007 } 2008 2009 if (pg1 != NULL) { 2010 uvmpdpol_pagerealize(pg1); 2011 } 2012 if (pg2 != NULL) { 2013 uvmpdpol_pagerealize(pg2); 2014 } 2015 } 2016 2017 /* 2018 * uvm_pagezero: zero fill a page 2019 * 2020 * => if page is part of an object then the object should be locked 2021 * to protect pg->flags. 2022 */ 2023 2024 void 2025 uvm_pagezero(struct vm_page *pg) 2026 { 2027 2028 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); 2029 pmap_zero_page(VM_PAGE_TO_PHYS(pg)); 2030 } 2031 2032 /* 2033 * uvm_pagecopy: copy a page 2034 * 2035 * => if page is part of an object then the object should be locked 2036 * to protect pg->flags. 2037 */ 2038 2039 void 2040 uvm_pagecopy(struct vm_page *src, struct vm_page *dst) 2041 { 2042 2043 uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY); 2044 pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst)); 2045 } 2046 2047 /* 2048 * uvm_pageismanaged: test it see that a page (specified by PA) is managed. 2049 */ 2050 2051 bool 2052 uvm_pageismanaged(paddr_t pa) 2053 { 2054 2055 return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID); 2056 } 2057 2058 /* 2059 * uvm_page_lookup_freelist: look up the free list for the specified page 2060 */ 2061 2062 int 2063 uvm_page_lookup_freelist(struct vm_page *pg) 2064 { 2065 uvm_physseg_t upm; 2066 2067 upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL); 2068 KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID); 2069 return uvm_physseg_get_free_list(upm); 2070 } 2071 2072 /* 2073 * uvm_page_owner_locked_p: return true if object associated with page is 2074 * locked. this is a weak check for runtime assertions only. 2075 */ 2076 2077 bool 2078 uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive) 2079 { 2080 2081 if (pg->uobject != NULL) { 2082 return exclusive 2083 ? rw_write_held(pg->uobject->vmobjlock) 2084 : rw_lock_held(pg->uobject->vmobjlock); 2085 } 2086 if (pg->uanon != NULL) { 2087 return exclusive 2088 ? 

/*
 * uvm_page_owner_locked_p: return true if the object associated with the
 * page is locked.  This is a weak check for runtime assertions only.
 */

bool
uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
{

	if (pg->uobject != NULL) {
		return exclusive
		    ? rw_write_held(pg->uobject->vmobjlock)
		    : rw_lock_held(pg->uobject->vmobjlock);
	}
	if (pg->uanon != NULL) {
		return exclusive
		    ? rw_write_held(pg->uanon->an_lock)
		    : rw_lock_held(pg->uanon->an_lock);
	}
	return true;
}

/*
 * uvm_pagereadonly_p: return true if the page should be mapped read-only
 */

bool
uvm_pagereadonly_p(struct vm_page *pg)
{
	struct uvm_object * const uobj = pg->uobject;

	KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
	KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
	if ((pg->flags & PG_RDONLY) != 0) {
		return true;
	}
	if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
		return true;
	}
	if (uobj == NULL) {
		return false;
	}
	return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
}

#ifdef PMAP_DIRECT
/*
 * Have pmap translate each page's physical address to a virtual address and
 * run a callback on it.  This avoids actually mapping the pages; the pmap
 * most likely uses a direct map or equivalent.
 */
int
uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
	    int (*process)(void *, size_t, void *), void *arg)
{
	int error = 0;
	paddr_t pa;
	size_t todo;
	voff_t pgoff = (off & PAGE_MASK);
	struct vm_page *pg;

	KASSERT(npages > 0 && len > 0);

	for (int i = 0; i < npages; i++) {
		pg = pgs[i];

		KASSERT(len > 0);

		/*
		 * Caller is responsible for ensuring all the pages are
		 * available.
		 */
		KASSERT(pg != NULL && pg != PGO_DONTCARE);

		pa = VM_PAGE_TO_PHYS(pg);
		todo = MIN(len, PAGE_SIZE - pgoff);

		error = pmap_direct_process(pa, pgoff, todo, process, arg);
		if (error)
			break;

		pgoff = 0;
		len -= todo;
	}

	KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
	return error;
}
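
/*
 * Sketch of a uvm_direct_process() callback (illustrative only; it assumes
 * the pmap hands the callback a kernel-virtual pointer to the chunk, the
 * chunk length, and the opaque "arg" unchanged).  The hypothetical callback
 * below just sums the bytes it is given:
 *
 *	static int
 *	sum_bytes(void *ptr, size_t len, void *arg)
 *	{
 *		uint64_t *sum = arg;
 *		const uint8_t *p = ptr;
 *
 *		while (len--)
 *			*sum += *p++;
 *		return 0;
 *	}
 *
 *	uint64_t total = 0;
 *	error = uvm_direct_process(pgs, npages, off, len, sum_bytes, &total);
 */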
#endif /* PMAP_DIRECT */

#if defined(DDB) || defined(DEBUGPRINT)

/*
 * uvm_page_printit: actually print the page
 */

static const char page_flagbits[] = UVM_PGFLAGBITS;
static const char page_pqflagbits[] = UVM_PQFLAGBITS;

void
uvm_page_printit(struct vm_page *pg, bool full,
    void (*pr)(const char *, ...))
{
	struct vm_page *tpg;
	struct uvm_object *uobj;
	struct pgflbucket *pgb;
	struct pgflist *pgl;
	char pgbuf[128];

	(*pr)("PAGE %p:\n", pg);
	snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
	(*pr)(" flags=%s\n", pgbuf);
	snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
	(*pr)(" pqflags=%s\n", pgbuf);
	(*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n",
	    pg->uobject, pg->uanon, (long long)pg->offset);
	(*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
	    pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
	    uvm_page_get_freelist(pg));
	(*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
#if defined(UVM_PAGE_TRKOWN)
	if (pg->flags & PG_BUSY)
		(*pr)(" owning process = %d.%d, tag=%s\n",
		    pg->owner, pg->lowner, pg->owner_tag);
	else
		(*pr)(" page not busy, no owner\n");
#else
	(*pr)(" [page ownership tracking disabled]\n");
#endif

	if (!full)
		return;

	/* cross-verify object/anon */
	if ((pg->flags & PG_FREE) == 0) {
		if (pg->flags & PG_ANON) {
			if (pg->uanon == NULL || pg->uanon->an_page != pg)
				(*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
				    (pg->uanon) ? pg->uanon->an_page : NULL);
			else
				(*pr)(" anon backpointer is OK\n");
		} else {
			uobj = pg->uobject;
			if (uobj) {
				(*pr)(" checking object list\n");
				tpg = uvm_pagelookup(uobj, pg->offset);
				if (tpg)
					(*pr)(" page found on object list\n");
				else
					(*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
			}
		}
	}

	/* cross-verify page queue */
	if (pg->flags & PG_FREE) {
		int fl = uvm_page_get_freelist(pg);
		int b = uvm_page_get_bucket(pg);
		pgb = uvm.page_free[fl].pgfl_buckets[b];
		pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
		(*pr)(" checking pageq list\n");
		LIST_FOREACH(tpg, pgl, pageq.list) {
			if (tpg == pg) {
				break;
			}
		}
		if (tpg)
			(*pr)(" page found on pageq list\n");
		else
			(*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
	}
}

/*
 * uvm_page_printall - print a summary of all managed pages
 */

void
uvm_page_printall(void (*pr)(const char *, ...))
{
	uvm_physseg_t i;
	paddr_t pfn;
	struct vm_page *pg;

	(*pr)("%18s %4s %4s %18s %18s"
#ifdef UVM_PAGE_TRKOWN
	    " OWNER"
#endif
	    "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
	for (i = uvm_physseg_get_first();
	     uvm_physseg_valid_p(i);
	     i = uvm_physseg_get_next(i)) {
		for (pfn = uvm_physseg_get_start(i);
		     pfn < uvm_physseg_get_end(i);
		     pfn++) {
			pg = PHYS_TO_VM_PAGE(ptoa(pfn));

			(*pr)("%18p %04x %08x %18p %18p",
			    pg, pg->flags, pg->pqflags, pg->uobject,
			    pg->uanon);
#ifdef UVM_PAGE_TRKOWN
			if (pg->flags & PG_BUSY)
				(*pr)(" %d [%s]", pg->owner, pg->owner_tag);
#endif
			(*pr)("\n");
		}
	}
}

/*
 * uvm_page_print_freelists - print a summary of the free lists
 */

void
uvm_page_print_freelists(void (*pr)(const char *, ...))
{
	struct pgfreelist *pgfl;
	struct pgflbucket *pgb;
	int fl, b, c;

	(*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
	    VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);

	for (fl = 0; fl < VM_NFREELIST; fl++) {
		pgfl = &uvm.page_free[fl];
		(*pr)("freelist(%d) @ %p\n", fl, pgfl);
		for (b = 0; b < uvm.bucketcount; b++) {
			pgb = uvm.page_free[fl].pgfl_buckets[b];
			(*pr)(" bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
			    b, pgb, pgb->pgb_nfree,
			    &uvm_freelist_locks[b].lock);
			for (c = 0; c < uvmexp.ncolors; c++) {
				(*pr)(" color(%d) @ %p, ", c,
				    &pgb->pgb_colors[c]);
				(*pr)("first page = %p\n",
				    LIST_FIRST(&pgb->pgb_colors[c]));
			}
		}
	}
}

#endif /* DDB || DEBUGPRINT */
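
/*
 * Illustrative usage only: with DDB or DEBUGPRINT configured, the printers
 * above accept any printf-like output function, e.g. the kernel printf():
 *
 *	uvm_page_printit(pg, true, printf);
 *	uvm_page_printall(printf);
 *	uvm_page_print_freelists(printf);
 */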