/*	$NetBSD: uvm_page.c,v 1.246 2020/08/15 01:27:22 tnn Exp $	*/

/*-
 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993, The Regents of the University of California.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_page.c   8.3 (Berkeley) 3/21/94
 * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * uvm_page.c: page ops.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.246 2020/08/15 01:27:22 tnn Exp $");

#include "opt_ddb.h"
#include "opt_uvm.h"
#include "opt_uvmhist.h"
#include "opt_readahead.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/radixtree.h>
#include <sys/atomic.h>
#include <sys/cpu.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pgflcache.h>

/*
 * number of pages per-CPU to reserve for the kernel.
 */
#ifndef	UVM_RESERVED_PAGES_PER_CPU
#define	UVM_RESERVED_PAGES_PER_CPU	5
#endif
int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;

/*
 * physical memory size;
 */
psize_t physmem;

/*
 * local variables
 */

/*
 * these variables record the values returned by vm_page_bootstrap,
 * for debugging purposes.  The implementation of uvm_pageboot_alloc
 * and pmap_startup here also uses them internally.
 */

static vaddr_t      virtual_space_start;
static vaddr_t      virtual_space_end;

/*
 * we allocate an initial number of page colors in uvm_page_init(),
 * and remember them.  We may re-color pages as cache sizes are
 * discovered during the autoconfiguration phase.  But we can never
 * free the initial set of buckets, since they are allocated using
 * uvm_pageboot_alloc().
 */

static size_t recolored_pages_memsize /* = 0 */;
static char *recolored_pages_mem;

/*
 * freelist locks - one per bucket.
 */

union uvm_freelist_lock	uvm_freelist_locks[PGFL_MAX_BUCKETS]
    __cacheline_aligned;

/*
 * basic NUMA information.
 */

static struct uvm_page_numa_region {
	struct uvm_page_numa_region	*next;
	paddr_t				start;
	paddr_t				size;
	u_int				numa_id;
} *uvm_page_numa_region;

#ifdef DEBUG
kmutex_t uvm_zerochecklock __cacheline_aligned;
vaddr_t uvm_zerocheckkva;
#endif /* DEBUG */

/*
 * These functions are reserved for uvm(9) internal use and are not
 * exported in the header file uvm_physseg.h
 *
 * Thus they are redefined here.
 */
void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);

/* returns a pgs array */
struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);

/*
 * inline functions
 */

/*
 * uvm_pageinsert: insert a page in the object.
 *
 * => caller must lock object
 * => caller should have already set pg's object and offset pointers
 *    and bumped the version counter
 */

static inline void
uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
{

	KASSERT(uobj == pg->uobject);
	KASSERT(rw_write_held(uobj->vmobjlock));
	KASSERT((pg->flags & PG_TABLED) == 0);

	if ((pg->flags & PG_STAT) != 0) {
		/* Cannot use uvm_pagegetdirty(): not yet in radix tree. */
		const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);

		if ((pg->flags & PG_FILE) != 0) {
			if (uobj->uo_npages == 0) {
				struct vnode *vp = (struct vnode *)uobj;
				mutex_enter(vp->v_interlock);
				KASSERT((vp->v_iflag & VI_PAGES) == 0);
				vp->v_iflag |= VI_PAGES;
				vholdl(vp);
				mutex_exit(vp->v_interlock);
			}
			if (UVM_OBJ_IS_VTEXT(uobj)) {
				cpu_count(CPU_COUNT_EXECPAGES, 1);
			}
			cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1);
		} else {
			cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1);
		}
	}
	pg->flags |= PG_TABLED;
	uobj->uo_npages++;
}

static inline int
uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
{
	const uint64_t idx = pg->offset >> PAGE_SHIFT;
	int error;

	KASSERT(rw_write_held(uobj->vmobjlock));

	error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
	if (error != 0) {
		return error;
	}
	if ((pg->flags & PG_CLEAN) == 0) {
		uvm_obj_page_set_dirty(pg);
	}
	KASSERT(((pg->flags & PG_CLEAN) == 0) ==
	    uvm_obj_page_dirty_p(pg));
	return 0;
}

/*
 * uvm_page_remove: remove page from object.
 *
 * => caller must lock object
 */

static inline void
uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
{

	KASSERT(uobj == pg->uobject);
	KASSERT(rw_write_held(uobj->vmobjlock));
	KASSERT(pg->flags & PG_TABLED);

	if ((pg->flags & PG_STAT) != 0) {
		/* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
		const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);

		if ((pg->flags & PG_FILE) != 0) {
			if (uobj->uo_npages == 1) {
				struct vnode *vp = (struct vnode *)uobj;
				mutex_enter(vp->v_interlock);
				KASSERT((vp->v_iflag & VI_PAGES) != 0);
				vp->v_iflag &= ~VI_PAGES;
				holdrelel(vp);
				mutex_exit(vp->v_interlock);
			}
			if (UVM_OBJ_IS_VTEXT(uobj)) {
				cpu_count(CPU_COUNT_EXECPAGES, -1);
			}
			cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1);
		} else {
			cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
		}
	}
	uobj->uo_npages--;
	pg->flags &= ~PG_TABLED;
	pg->uobject = NULL;
}

static inline void
uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
{
	struct vm_page *opg __unused;

	KASSERT(rw_write_held(uobj->vmobjlock));

	opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
	KASSERT(pg == opg);
}

static void
uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
{
	int i;

	pgb->pgb_nfree = 0;
	for (i = 0; i < uvmexp.ncolors; i++) {
		LIST_INIT(&pgb->pgb_colors[i]);
	}
	pgfl->pgfl_buckets[num] = pgb;
}

/*
 * uvm_page_init: init the page system.   called from uvm_init().
 *
 * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
 */

void
uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
{
	static struct uvm_cpu boot_cpu __cacheline_aligned;
	psize_t freepages, pagecount, bucketsize, n;
	struct pgflbucket *pgb;
	struct vm_page *pagearray;
	char *bucketarray;
	uvm_physseg_t bank;
	int fl, b;

	KASSERT(ncpu <= 1);

	/*
	 * init the page queues and free page queue locks, except the
	 * free list; we allocate that later (with the initial vm_page
	 * structures).
	 */

	curcpu()->ci_data.cpu_uvm = &boot_cpu;
	uvmpdpol_init();
	for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
		mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
	}

	/*
	 * allocate vm_page structures.
	 */

	/*
	 * sanity check:
	 * before calling this function the MD code is expected to register
	 * some free RAM with the uvm_page_physload() function.   our job
	 * now is to allocate vm_page structures for this memory.
	 */

	if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
		panic("uvm_page_bootstrap: no memory pre-allocated");

	/*
	 * first calculate the number of free pages...
	 *
	 * note that we use start/end rather than avail_start/avail_end.
	 * this allows us to allocate extra vm_page structures in case we
	 * want to return some memory to the pool after booting.
	 */

	freepages = 0;

	for (bank = uvm_physseg_get_first();
	     uvm_physseg_valid_p(bank) ;
	     bank = uvm_physseg_get_next(bank)) {
		freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
	}

	/*
	 * Let MD code initialize the number of colors, or default
	 * to 1 color if MD code doesn't care.
	 */
	if (uvmexp.ncolors == 0)
		uvmexp.ncolors = 1;
	uvmexp.colormask = uvmexp.ncolors - 1;
	KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);

	/* We always start with only 1 bucket. */
	uvm.bucketcount = 1;

	/*
	 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
	 * use.  for each page of memory we use we need a vm_page structure.
	 * thus, the total number of pages we can use is the total size of
	 * the memory divided by the PAGE_SIZE plus the size of the vm_page
	 * structure.   we add one to freepages as a fudge factor to avoid
	 * truncation errors (since we can only allocate in terms of whole
	 * pages).
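	 * for illustration (assuming a 4 KiB page and a vm_page of roughly
	 * 128 bytes): pagecount works out to 4096/4224 of freepages, i.e.
	 * about 97%, with the remaining stolen memory holding the vm_page
	 * array itself.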
	 */
	pagecount = ((freepages + 1) << PAGE_SHIFT) /
	    (PAGE_SIZE + sizeof(struct vm_page));
	bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
	bucketsize = roundup2(bucketsize, coherency_unit);
	bucketarray = (void *)uvm_pageboot_alloc(
	    bucketsize * VM_NFREELIST +
	    pagecount * sizeof(struct vm_page));
	pagearray = (struct vm_page *)
	    (bucketarray + bucketsize * VM_NFREELIST);

	for (fl = 0; fl < VM_NFREELIST; fl++) {
		pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
		uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
	}
	memset(pagearray, 0, pagecount * sizeof(struct vm_page));

	/*
	 * init the freelist cache in the disabled state.
	 */
	uvm_pgflcache_init();

	/*
	 * init the vm_page structures and put them in the correct place.
	 */
	/* First init the extent */

	for (bank = uvm_physseg_get_first(),
		 uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
	     uvm_physseg_valid_p(bank);
	     bank = uvm_physseg_get_next(bank)) {

		n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
		uvm_physseg_seg_alloc_from_slab(bank, n);
		uvm_physseg_init_seg(bank, pagearray);

		/* set up page array pointers */
		pagearray += n;
		pagecount -= n;
	}

	/*
	 * pass up the values of virtual_space_start and
	 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
	 * layers of the VM.
	 */

	*kvm_startp = round_page(virtual_space_start);
	*kvm_endp = trunc_page(virtual_space_end);
#ifdef DEBUG
	/*
	 * steal kva for uvm_pagezerocheck().
	 */
	uvm_zerocheckkva = *kvm_startp;
	*kvm_startp += PAGE_SIZE;
	mutex_init(&uvm_zerochecklock, MUTEX_DEFAULT, IPL_VM);
#endif /* DEBUG */

	/*
	 * init various thresholds.
	 */

	uvmexp.reserve_pagedaemon = 1;
	uvmexp.reserve_kernel = vm_page_reserve_kernel;

	/*
	 * done!
	 */

	uvm.page_init_done = true;
}

/*
 * uvm_pgfl_lock: lock all freelist buckets
 */

void
uvm_pgfl_lock(void)
{
	int i;

	for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
		mutex_spin_enter(&uvm_freelist_locks[i].lock);
	}
}

/*
 * uvm_pgfl_unlock: unlock all freelist buckets
 */

void
uvm_pgfl_unlock(void)
{
	int i;

	for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
		mutex_spin_exit(&uvm_freelist_locks[i].lock);
	}
}

/*
 * uvm_setpagesize: set the page size
 *
 * => sets page_shift and page_mask from uvmexp.pagesize.
 */

void
uvm_setpagesize(void)
{

	/*
	 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
	 * to be a constant (indicated by being a non-zero value).
	 */
	if (uvmexp.pagesize == 0) {
		if (PAGE_SIZE == 0)
			panic("uvm_setpagesize: uvmexp.pagesize not set");
		uvmexp.pagesize = PAGE_SIZE;
	}
	uvmexp.pagemask = uvmexp.pagesize - 1;
	if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
		panic("uvm_setpagesize: page size %u (%#x) not a power of two",
		    uvmexp.pagesize, uvmexp.pagesize);
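	/* derive pageshift: the smallest shift with (1 << shift) == pagesize */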
	for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
		if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
			break;
}

/*
 * uvm_pageboot_alloc: steal memory from physmem for bootstrapping
 */

vaddr_t
uvm_pageboot_alloc(vsize_t size)
{
	static bool initialized = false;
	vaddr_t addr;
#if !defined(PMAP_STEAL_MEMORY)
	vaddr_t vaddr;
	paddr_t paddr;
#endif

	/*
	 * on first call to this function, initialize ourselves.
	 */
	if (initialized == false) {
		pmap_virtual_space(&virtual_space_start, &virtual_space_end);

		/* round it the way we like it */
		virtual_space_start = round_page(virtual_space_start);
		virtual_space_end = trunc_page(virtual_space_end);

		initialized = true;
	}

	/* round to page size */
	size = round_page(size);
	uvmexp.bootpages += atop(size);

#if defined(PMAP_STEAL_MEMORY)

	/*
	 * defer bootstrap allocation to MD code (it may want to allocate
	 * from a direct-mapped segment).  pmap_steal_memory should adjust
	 * virtual_space_start/virtual_space_end if necessary.
	 */

	addr = pmap_steal_memory(size, &virtual_space_start,
	    &virtual_space_end);

	return(addr);

#else /* !PMAP_STEAL_MEMORY */

	/*
	 * allocate virtual memory for this request
	 */
	if (virtual_space_start == virtual_space_end ||
	    (virtual_space_end - virtual_space_start) < size)
		panic("uvm_pageboot_alloc: out of virtual space");

	addr = virtual_space_start;

#ifdef PMAP_GROWKERNEL
	/*
	 * If the kernel pmap can't map the requested space,
	 * then allocate more resources for it.
	 */
	if (uvm_maxkaddr < (addr + size)) {
		uvm_maxkaddr = pmap_growkernel(addr + size);
		if (uvm_maxkaddr < (addr + size))
			panic("uvm_pageboot_alloc: pmap_growkernel() failed");
	}
#endif

	virtual_space_start += size;

	/*
	 * allocate and mapin physical pages to back new virtual pages
	 */

	for (vaddr = round_page(addr) ; vaddr < addr + size ;
	    vaddr += PAGE_SIZE) {

		if (!uvm_page_physget(&paddr))
			panic("uvm_pageboot_alloc: out of memory");

		/*
		 * Note this memory is no longer managed, so using
		 * pmap_kenter is safe.
		 */
		pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
	}
	pmap_update(pmap_kernel());
	return(addr);
#endif	/* PMAP_STEAL_MEMORY */
}

#if !defined(PMAP_STEAL_MEMORY)
/*
 * uvm_page_physget: "steal" one page from the vm_physmem structure.
 *
 * => attempt to allocate it off the end of a segment in which the "avail"
 *    values match the start/end values.   if we can't do that, then we
 *    will advance both values (making them equal, and removing some
 *    vm_page structures from the non-avail area).
 * => return false if out of memory.
 */

/* subroutine: try to allocate from memory chunks on the specified freelist */
static bool uvm_page_physget_freelist(paddr_t *, int);

static bool
uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
{
	uvm_physseg_t lcv;

	/* pass 1: try allocating from a matching end */
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
	for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
#else
	for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
#endif
	{
		if (uvm.page_init_done == true)
			panic("uvm_page_physget: called _after_ bootstrap");

		/* Try to match at front or back on unused segment */
		if (uvm_page_physunload(lcv, freelist, paddrp))
			return true;
	}

	/* pass2: forget about matching ends, just allocate something */
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
	for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
#else
	for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
#endif
	{
		/* Try the front regardless. */
		if (uvm_page_physunload_force(lcv, freelist, paddrp))
			return true;
	}
	return false;
}

bool
uvm_page_physget(paddr_t *paddrp)
{
	int i;

	/* try in the order of freelist preference */
	for (i = 0; i < VM_NFREELIST; i++)
		if (uvm_page_physget_freelist(paddrp, i) == true)
			return (true);
	return (false);
}
#endif /* PMAP_STEAL_MEMORY */

/*
 * PHYS_TO_VM_PAGE: find vm_page for a PA.   used by MI code to get vm_pages
 * back from an I/O mapping (ugh!).   used in some MD code as well.
 */
struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)
{
	paddr_t pf = atop(pa);
	paddr_t off;
	uvm_physseg_t upm;

	upm = uvm_physseg_find(pf, &off);
	if (upm != UVM_PHYSSEG_TYPE_INVALID)
		return uvm_physseg_get_pg(upm, off);
	return(NULL);
}

paddr_t
uvm_vm_page_to_phys(const struct vm_page *pg)
{

	return pg->phys_addr & ~(PAGE_SIZE - 1);
}

/*
 * uvm_page_numa_load: load NUMA range description.
 */
void
uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
{
	struct uvm_page_numa_region *d;

	KASSERT(numa_id < PGFL_MAX_BUCKETS);

	d = kmem_alloc(sizeof(*d), KM_SLEEP);
	d->start = start;
	d->size = size;
	d->numa_id = numa_id;
	d->next = uvm_page_numa_region;
	uvm_page_numa_region = d;
}

/*
 * uvm_page_numa_lookup: lookup NUMA node for the given page.
 */
static u_int
uvm_page_numa_lookup(struct vm_page *pg)
{
	struct uvm_page_numa_region *d;
	static bool warned;
	paddr_t pa;

	KASSERT(uvm_page_numa_region != NULL);

	pa = VM_PAGE_TO_PHYS(pg);
	for (d = uvm_page_numa_region; d != NULL; d = d->next) {
		if (pa >= d->start && pa < d->start + d->size) {
			return d->numa_id;
		}
	}

	if (!warned) {
		printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#"
		    PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg));
		warned = true;
	}

	return 0;
}

/*
 * uvm_page_redim: adjust freelist dimensions if they have changed.
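 *
 * => called with the new color and bucket counts; every free page is moved
 *    onto freshly allocated bucket/color lists, and the old bucket memory
 *    (except the bootstrap allocation) is freed afterwards.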
 */

static void
uvm_page_redim(int newncolors, int newnbuckets)
{
	struct pgfreelist npgfl;
	struct pgflbucket *opgb, *npgb;
	struct pgflist *ohead, *nhead;
	struct vm_page *pg;
	size_t bucketsize, bucketmemsize, oldbucketmemsize;
	int fl, ob, oc, nb, nc, obuckets, ocolors;
	char *bucketarray, *oldbucketmem, *bucketmem;

	KASSERT(((newncolors - 1) & newncolors) == 0);

	/* Anything to do? */
	if (newncolors <= uvmexp.ncolors &&
	    newnbuckets == uvm.bucketcount) {
		return;
	}
	if (uvm.page_init_done == false) {
		uvmexp.ncolors = newncolors;
		return;
	}

	bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
	bucketsize = roundup2(bucketsize, coherency_unit);
	bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
	    coherency_unit - 1;
	bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
	bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);

	ocolors = uvmexp.ncolors;
	obuckets = uvm.bucketcount;

	/* Freelist cache mustn't be enabled. */
	uvm_pgflcache_pause();

	/* Make sure we should still do this. */
	uvm_pgfl_lock();
	if (newncolors <= uvmexp.ncolors &&
	    newnbuckets == uvm.bucketcount) {
		uvm_pgfl_unlock();
		uvm_pgflcache_resume();
		kmem_free(bucketmem, bucketmemsize);
		return;
	}

	uvmexp.ncolors = newncolors;
	uvmexp.colormask = uvmexp.ncolors - 1;
	uvm.bucketcount = newnbuckets;

	for (fl = 0; fl < VM_NFREELIST; fl++) {
		/* Init new buckets in new freelist. */
		memset(&npgfl, 0, sizeof(npgfl));
		for (nb = 0; nb < newnbuckets; nb++) {
			npgb = (struct pgflbucket *)bucketarray;
			uvm_page_init_bucket(&npgfl, npgb, nb);
			bucketarray += bucketsize;
		}
		/* Now transfer pages from the old freelist. */
		for (nb = ob = 0; ob < obuckets; ob++) {
			opgb = uvm.page_free[fl].pgfl_buckets[ob];
			for (oc = 0; oc < ocolors; oc++) {
				ohead = &opgb->pgb_colors[oc];
				while ((pg = LIST_FIRST(ohead)) != NULL) {
					LIST_REMOVE(pg, pageq.list);
					/*
					 * Here we decide on the NEW color &
					 * bucket for the page.  For NUMA
					 * we'll use the info that the
					 * hardware gave us.  For non-NUMA
					 * we take the physical page frame
					 * number and cache color into
					 * account.  We do this to try and
					 * avoid defeating any memory
					 * interleaving in the hardware.
					 */
					KASSERT(
					    uvm_page_get_bucket(pg) == ob);
					KASSERT(fl ==
					    uvm_page_get_freelist(pg));
					if (uvm_page_numa_region != NULL) {
						nb = uvm_page_numa_lookup(pg);
					} else {
						nb = atop(VM_PAGE_TO_PHYS(pg))
						    / uvmexp.ncolors / 8
						    % newnbuckets;
					}
					uvm_page_set_bucket(pg, nb);
					npgb = npgfl.pgfl_buckets[nb];
					npgb->pgb_nfree++;
					nc = VM_PGCOLOR(pg);
					nhead = &npgb->pgb_colors[nc];
					LIST_INSERT_HEAD(nhead, pg, pageq.list);
				}
			}
		}
		/* Install the new freelist. */
		memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
	}

	/* Unlock and free the old memory. */
	oldbucketmemsize = recolored_pages_memsize;
	oldbucketmem = recolored_pages_mem;
	recolored_pages_memsize = bucketmemsize;
	recolored_pages_mem = bucketmem;

	uvm_pgfl_unlock();
	uvm_pgflcache_resume();

	if (oldbucketmemsize) {
		kmem_free(oldbucketmem, oldbucketmemsize);
	}

	/*
	 * this calls uvm_km_alloc() which may want to hold
	 * uvm_freelist_lock.
	 */
	uvm_pager_realloc_emerg();
}

/*
 * uvm_page_recolor: Recolor the pages if the new color count is
 * larger than the old one.
 */

void
uvm_page_recolor(int newncolors)
{

	uvm_page_redim(newncolors, uvm.bucketcount);
}

/*
 * uvm_page_rebucket: Determine a bucket structure and redim the free
 * lists to match.
 */

void
uvm_page_rebucket(void)
{
	u_int min_numa, max_numa, npackage, shift;
	struct cpu_info *ci, *ci2, *ci3;
	CPU_INFO_ITERATOR cii;

	/*
	 * If we have more than one NUMA node, and the maximum NUMA node ID
	 * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
	 * for free pages.
	 */
	min_numa = (u_int)-1;
	max_numa = 0;
	for (CPU_INFO_FOREACH(cii, ci)) {
		if (ci->ci_numa_id < min_numa) {
			min_numa = ci->ci_numa_id;
		}
		if (ci->ci_numa_id > max_numa) {
			max_numa = ci->ci_numa_id;
		}
	}
	if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
		aprint_debug("UVM: using NUMA allocation scheme\n");
		for (CPU_INFO_FOREACH(cii, ci)) {
			ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
		}
		uvm_page_redim(uvmexp.ncolors, max_numa + 1);
		return;
	}

	/*
	 * Otherwise we'll go with a scheme to maximise L2/L3 cache locality
	 * and minimise lock contention.  Count the total number of CPU
	 * packages, and then try to distribute the buckets among CPU
	 * packages evenly.
	 */
	npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];

	/*
	 * Figure out how to arrange the packages & buckets, and the total
	 * number of buckets we need.  XXX 2 may not be the best factor.
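	 * (e.g. with 16 packages and a limit of 8 buckets, the loop below
	 * yields shift = 1, i.e. two packages share each bucket.)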
	 */
	for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
		npackage >>= 1;
	}
	uvm_page_redim(uvmexp.ncolors, npackage);

	/*
	 * Now tell each CPU which bucket to use.  In the outer loop, scroll
	 * through all CPU packages.
	 */
	npackage = 0;
	ci = curcpu();
	ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
	do {
		/*
		 * In the inner loop, scroll through all CPUs in the package
		 * and assign the same bucket ID.
		 */
		ci3 = ci2;
		do {
			ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
			ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
		} while (ci3 != ci2);
		npackage++;
		ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
	} while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);

	aprint_debug("UVM: using package allocation scheme, "
	    "%d package(s) per bucket\n", 1 << shift);
}

/*
 * uvm_cpu_attach: initialize per-CPU data structures.
 */

void
uvm_cpu_attach(struct cpu_info *ci)
{
	struct uvm_cpu *ucpu;

	/* Already done in uvm_page_init(). */
	if (!CPU_IS_PRIMARY(ci)) {
		/* Add more reserve pages for this CPU. */
		uvmexp.reserve_kernel += vm_page_reserve_kernel;

		/* Allocate per-CPU data structures. */
		ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
		    KM_SLEEP);
		ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
		    coherency_unit);
		ci->ci_data.cpu_uvm = ucpu;
	} else {
		ucpu = ci->ci_data.cpu_uvm;
	}

	uvmpdpol_init_cpu(ucpu);

	/*
	 * Attach RNG source for this CPU's VM events
	 */
	rnd_attach_source(&ucpu->rs, ci->ci_data.cpu_name, RND_TYPE_VM,
	    RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
	    RND_FLAG_ESTIMATE_VALUE);
}

/*
 * uvm_availmem: fetch the total amount of free memory in pages.  this can
 * have a detrimental effect on performance due to false sharing; don't call
 * unless needed.
 *
 * some users can request the amount of free memory so often that it begins
 * to impact upon performance.  if calling frequently and an inexact value
 * is okay, call with cached = true.
 */

int
uvm_availmem(bool cached)
{
	int64_t fp;

	cpu_count_sync(cached);
	if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) {
		/*
		 * XXXAD could briefly go negative because it's impossible
		 * to get a clean snapshot.  address this for other counters
		 * used as running totals before NetBSD 10 although less
		 * important for those.
		 */
		fp = 0;
	}
	return (int)fp;
}

/*
 * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
 * specific freelist and specific bucket only.
 *
 * => must be at IPL_VM or higher to protect per-CPU data structures.
 */

static struct vm_page *
uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
{
	int c, trycolor, colormask;
	struct pgflbucket *pgb;
	struct vm_page *pg;
	kmutex_t *lock;
	bool fill;

	/*
	 * Skip the bucket if empty, no lock needed.  There could be many
	 * empty freelists/buckets.
	 */
	pgb = uvm.page_free[f].pgfl_buckets[b];
	if (pgb->pgb_nfree == 0) {
		return NULL;
	}

	/* Skip bucket if low on memory. */
	lock = &uvm_freelist_locks[b].lock;
	mutex_spin_enter(lock);
	if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
		if ((flags & UVM_PGA_USERESERVE) == 0 ||
		    (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
		     curlwp != uvm.pagedaemon_lwp)) {
			mutex_spin_exit(lock);
			return NULL;
		}
		fill = false;
	} else {
		fill = true;
	}

	/* Try all page colors as needed. */
	c = trycolor = *trycolorp;
	colormask = uvmexp.colormask;
	do {
		pg = LIST_FIRST(&pgb->pgb_colors[c]);
		if (__predict_true(pg != NULL)) {
			/*
			 * Got a free page!  PG_FREE must be cleared under
			 * lock because of uvm_pglistalloc().
			 */
			LIST_REMOVE(pg, pageq.list);
			KASSERT(pg->flags == PG_FREE);
			pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
			pgb->pgb_nfree--;

			/*
			 * While we have the bucket locked and our data
			 * structures fresh in L1 cache, we have an ideal
			 * opportunity to grab some pages for the freelist
			 * cache without causing extra contention.  Only do
			 * so if we found pages in this CPU's preferred
			 * bucket.
			 */
			if (__predict_true(b == ucpu->pgflbucket && fill)) {
				uvm_pgflcache_fill(ucpu, f, b, c);
			}
			mutex_spin_exit(lock);
			KASSERT(uvm_page_get_bucket(pg) == b);
			CPU_COUNT(c == trycolor ?
			    CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
			CPU_COUNT(CPU_COUNT_CPUMISS, 1);
			*trycolorp = c;
			return pg;
		}
		c = (c + 1) & colormask;
	} while (c != trycolor);
	mutex_spin_exit(lock);

	return NULL;
}

/*
 * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
 * any color from any bucket, in a specific freelist.
 *
 * => must be at IPL_VM or higher to protect per-CPU data structures.
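 * => tries the per-CPU freelist cache first, then walks every bucket in
 *    the freelist, starting with this CPU's preferred bucket.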
 */

static struct vm_page *
uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
{
	int b, trybucket, bucketcount;
	struct vm_page *pg;

	/* Try for the exact thing in the per-CPU cache. */
	if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
		CPU_COUNT(CPU_COUNT_CPUHIT, 1);
		CPU_COUNT(CPU_COUNT_COLORHIT, 1);
		return pg;
	}

	/* Walk through all buckets, trying our preferred bucket first. */
	trybucket = ucpu->pgflbucket;
	b = trybucket;
	bucketcount = uvm.bucketcount;
	do {
		pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
		if (pg != NULL) {
			return pg;
		}
		b = (b + 1 == bucketcount ? 0 : b + 1);
	} while (b != trybucket);

	return NULL;
}

/*
 * uvm_pagealloc_strat: allocate vm_page from a particular free list.
 *
 * => return null if no pages free
 * => wake up pagedaemon if number of free pages drops below low water mark
 * => if obj != NULL, obj must be locked (to put in obj's tree)
 * => if anon != NULL, anon must be locked (to put in anon)
 * => only one of obj or anon can be non-null
 * => caller must activate/deactivate page if it is not wired.
 * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
 * => policy decision: it is more important to pull a page off of the
 *	appropriate priority free list than it is to get a page from the
 *	correct bucket or color bin.  This is because we live with the
 *	consequences of a bad free list decision for the entire
 *	lifetime of the page, e.g. if the page comes from memory that
 *	is slower to access.
 */

struct vm_page *
uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
    int flags, int strat, int free_list)
{
	int color, lcv, error, s;
	struct uvm_cpu *ucpu;
	struct vm_page *pg;
	lwp_t *l;

	KASSERT(obj == NULL || anon == NULL);
	KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
	KASSERT(off == trunc_page(off));
	KASSERT(obj == NULL || rw_write_held(obj->vmobjlock));
	KASSERT(anon == NULL || anon->an_lock == NULL ||
	    rw_write_held(anon->an_lock));

	/*
	 * This implements a global round-robin page coloring
	 * algorithm.
	 */

	s = splvm();
	ucpu = curcpu()->ci_data.cpu_uvm;
	if (flags & UVM_FLAG_COLORMATCH) {
		color = atop(off) & uvmexp.colormask;
	} else {
		color = ucpu->pgflcolor;
	}

	/*
	 * fail if any of these conditions is true:
	 * [1]  there really are no free pages, or
	 * [2]  only kernel "reserved" pages remain and
	 *        reserved pages have not been requested.
	 * [3]  only pagedaemon "reserved" pages remain and
	 *        the requestor isn't the pagedaemon.
	 * we make kernel reserve pages available if called by a
	 * kernel thread.
	 */
	l = curlwp;
	if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) {
		flags |= UVM_PGA_USERESERVE;
	}

 again:
	switch (strat) {
	case UVM_PGA_STRAT_NORMAL:
		/* Check freelists: descending priority (ascending id) order. */
		for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
			pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
			if (pg != NULL) {
				goto gotit;
			}
		}

		/* No pages free!  Have pagedaemon free some memory. */
		splx(s);
		uvm_kick_pdaemon();
		return NULL;

	case UVM_PGA_STRAT_ONLY:
	case UVM_PGA_STRAT_FALLBACK:
		/* Attempt to allocate from the specified free list. */
		KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
		pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
		if (pg != NULL) {
			goto gotit;
		}

		/* Fall back, if possible. */
		if (strat == UVM_PGA_STRAT_FALLBACK) {
			strat = UVM_PGA_STRAT_NORMAL;
			goto again;
		}

		/* No pages free!  Have pagedaemon free some memory. */
		splx(s);
		uvm_kick_pdaemon();
		return NULL;

	case UVM_PGA_STRAT_NUMA:
		/*
		 * NUMA strategy (experimental): allocating from the correct
		 * bucket is more important than observing freelist
		 * priority.  Look only to the current NUMA node; if that
		 * fails, we need to look to other NUMA nodes, so retry with
		 * the normal strategy.
		 */
		for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
			pg = uvm_pgflcache_alloc(ucpu, lcv, color);
			if (pg != NULL) {
				CPU_COUNT(CPU_COUNT_CPUHIT, 1);
				CPU_COUNT(CPU_COUNT_COLORHIT, 1);
				goto gotit;
			}
			pg = uvm_pagealloc_pgb(ucpu, lcv,
			    ucpu->pgflbucket, &color, flags);
			if (pg != NULL) {
				goto gotit;
			}
		}
		strat = UVM_PGA_STRAT_NORMAL;
		goto again;

	default:
		panic("uvm_pagealloc_strat: bad strat %d", strat);
		/* NOTREACHED */
	}

 gotit:
	/*
	 * We now know which color we actually allocated from; set
	 * the next color accordingly.
	 */

	ucpu->pgflcolor = (color + 1) & uvmexp.colormask;

	/*
	 * while still at IPL_VM, update allocation statistics.
	 */

	CPU_COUNT(CPU_COUNT_FREEPAGES, -1);
	if (anon) {
		CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
	}
	splx(s);
	KASSERT(pg->flags == (PG_BUSY|PG_CLEAN|PG_FAKE));

	/*
	 * assign the page to the object.  as the page was free, we know
	 * that pg->uobject and pg->uanon are NULL.  we only need to take
	 * the page's interlock if we are changing the values.
	 */
	if (anon != NULL || obj != NULL) {
		mutex_enter(&pg->interlock);
	}
	pg->offset = off;
	pg->uobject = obj;
	pg->uanon = anon;
	KASSERT(uvm_page_owner_locked_p(pg, true));
	if (anon) {
		anon->an_page = pg;
		pg->flags |= PG_ANON;
		mutex_exit(&pg->interlock);
	} else if (obj) {
		/*
		 * set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
		 */
		if (UVM_OBJ_IS_VNODE(obj)) {
			pg->flags |= PG_FILE;
		} else if (UVM_OBJ_IS_AOBJ(obj)) {
			pg->flags |= PG_AOBJ;
		}
		uvm_pageinsert_object(obj, pg);
		mutex_exit(&pg->interlock);
		error = uvm_pageinsert_tree(obj, pg);
		if (error != 0) {
			mutex_enter(&pg->interlock);
			uvm_pageremove_object(obj, pg);
			mutex_exit(&pg->interlock);
			uvm_pagefree(pg);
			return NULL;
		}
	}

#if defined(UVM_PAGE_TRKOWN)
	pg->owner_tag = NULL;
#endif
	UVM_PAGE_OWN(pg, "new alloc");

	if (flags & UVM_PGA_ZERO) {
		/* A zero'd page is not clean. */
		if (obj != NULL || anon != NULL) {
			uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
		}
		pmap_zero_page(VM_PAGE_TO_PHYS(pg));
	}

	return(pg);
}

/*
 * uvm_pagereplace: replace a page with another
 *
 * => object must be locked
 * => page interlocks must be held
 */

void
uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
{
	struct uvm_object *uobj = oldpg->uobject;
	struct vm_page *pg __diagused;
	uint64_t idx;

	KASSERT((oldpg->flags & PG_TABLED) != 0);
	KASSERT(uobj != NULL);
	KASSERT((newpg->flags & PG_TABLED) == 0);
	KASSERT(newpg->uobject == NULL);
	KASSERT(rw_write_held(uobj->vmobjlock));
	KASSERT(mutex_owned(&oldpg->interlock));
	KASSERT(mutex_owned(&newpg->interlock));

	newpg->uobject = uobj;
	newpg->offset = oldpg->offset;
	idx = newpg->offset >> PAGE_SHIFT;
	pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
	KASSERT(pg == oldpg);
	if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
		if ((newpg->flags & PG_CLEAN) != 0) {
			uvm_obj_page_clear_dirty(newpg);
		} else {
			uvm_obj_page_set_dirty(newpg);
		}
	}
	/*
	 * oldpg's PG_STAT is stable.  newpg is not reachable by others yet.
	 */
	newpg->flags |=
	    (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
	uvm_pageinsert_object(uobj, newpg);
	uvm_pageremove_object(uobj, oldpg);
}

/*
 * uvm_pagerealloc: reallocate a page from one object to another
 *
 * => both objects must be locked
 */

int
uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
{
	int error = 0;

	/*
	 * remove it from the old object
	 */

	if (pg->uobject) {
		uvm_pageremove_tree(pg->uobject, pg);
		uvm_pageremove_object(pg->uobject, pg);
	}

	/*
	 * put it in the new object
	 */

	if (newobj) {
		mutex_enter(&pg->interlock);
		pg->uobject = newobj;
		pg->offset = newoff;
		if (UVM_OBJ_IS_VNODE(newobj)) {
			pg->flags |= PG_FILE;
		} else if (UVM_OBJ_IS_AOBJ(newobj)) {
			pg->flags |= PG_AOBJ;
		}
		uvm_pageinsert_object(newobj, pg);
		mutex_exit(&pg->interlock);
		error = uvm_pageinsert_tree(newobj, pg);
		if (error != 0) {
			mutex_enter(&pg->interlock);
			uvm_pageremove_object(newobj, pg);
			mutex_exit(&pg->interlock);
		}
	}

	return error;
}

#ifdef DEBUG
/*
 * check if page is zero-filled
 */
void
uvm_pagezerocheck(struct vm_page *pg)
{
	int *p, *ep;

	KASSERT(uvm_zerocheckkva != 0);

	/*
	 * XXX assuming pmap_kenter_pa and pmap_kremove never call
	 * uvm page allocator.
	 *
	 * it might be better to have "CPU-local temporary map" pmap interface.
	 */
	mutex_spin_enter(&uvm_zerochecklock);
	pmap_kenter_pa(uvm_zerocheckkva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, 0);
	p = (int *)uvm_zerocheckkva;
	ep = (int *)((char *)p + PAGE_SIZE);
	pmap_update(pmap_kernel());
	while (p < ep) {
		if (*p != 0)
			panic("zero page isn't zero-filled");
		p++;
	}
	pmap_kremove(uvm_zerocheckkva, PAGE_SIZE);
	mutex_spin_exit(&uvm_zerochecklock);
	/*
	 * pmap_update() is not necessary here because no one except us
	 * uses this VA.
	 */
}
#endif /* DEBUG */

/*
 * uvm_pagefree: free page
 *
 * => erase page's identity (i.e. remove from object)
 * => put page on free list
 * => caller must lock owning object (either anon or uvm_object)
 * => assumes all valid mappings of pg are gone
 */

void
uvm_pagefree(struct vm_page *pg)
{
	struct pgfreelist *pgfl;
	struct pgflbucket *pgb;
	struct uvm_cpu *ucpu;
	kmutex_t *lock;
	int bucket, s;
	bool locked;

#ifdef DEBUG
	if (pg->uobject == (void *)0xdeadbeef &&
	    pg->uanon == (void *)0xdeadbeef) {
		panic("uvm_pagefree: freeing free page %p", pg);
	}
#endif /* DEBUG */

	KASSERT((pg->flags & PG_PAGEOUT) == 0);
	KASSERT(!(pg->flags & PG_FREE));
	KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock));
	KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
	    rw_write_held(pg->uanon->an_lock));

	/*
	 * remove the page from the object's tree before acquiring any page
	 * interlocks: this can acquire locks to free radixtree nodes.
	 */
	if (pg->uobject != NULL) {
		uvm_pageremove_tree(pg->uobject, pg);
	}

	/*
	 * if the page is loaned, resolve the loan instead of freeing.
	 */

	if (pg->loan_count) {
		KASSERT(pg->wire_count == 0);

		/*
		 * if the page is owned by an anon then we just want to
		 * drop anon ownership.  the kernel will free the page when
		 * it is done with it.  if the page is owned by an object,
		 * remove it from the object and mark it dirty for the benefit
		 * of possible anon owners.
		 *
		 * regardless of previous ownership, wakeup any waiters,
		 * unbusy the page, and we're done.
		 */

		uvm_pagelock(pg);
		locked = true;
		if (pg->uobject != NULL) {
			uvm_pageremove_object(pg->uobject, pg);
			pg->flags &= ~(PG_FILE|PG_AOBJ);
		} else if (pg->uanon != NULL) {
			if ((pg->flags & PG_ANON) == 0) {
				pg->loan_count--;
			} else {
				const unsigned status = uvm_pagegetdirty(pg);
				pg->flags &= ~PG_ANON;
				cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
			}
			pg->uanon->an_page = NULL;
			pg->uanon = NULL;
		}
		if (pg->pqflags & PQ_WANTED) {
			wakeup(pg);
		}
		pg->pqflags &= ~PQ_WANTED;
		pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1);
#ifdef UVM_PAGE_TRKOWN
		pg->owner_tag = NULL;
#endif
		KASSERT((pg->flags & PG_STAT) == 0);
		if (pg->loan_count) {
			KASSERT(pg->uobject == NULL);
			if (pg->uanon == NULL) {
				uvm_pagedequeue(pg);
			}
			uvm_pageunlock(pg);
			return;
		}
	} else if (pg->uobject != NULL || pg->uanon != NULL ||
	           pg->wire_count != 0) {
		uvm_pagelock(pg);
		locked = true;
	} else {
		locked = false;
	}

	/*
	 * remove page from its object or anon.
	 */
	if (pg->uobject != NULL) {
		uvm_pageremove_object(pg->uobject, pg);
	} else if (pg->uanon != NULL) {
		const unsigned int status = uvm_pagegetdirty(pg);
		pg->uanon->an_page = NULL;
		pg->uanon = NULL;
		cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
	}

	/*
	 * if the page was wired, unwire it now.
	 */

	if (pg->wire_count) {
		pg->wire_count = 0;
		atomic_dec_uint(&uvmexp.wired);
	}
	if (locked) {
		/*
		 * wake anyone waiting on the page.
		 */
		if ((pg->pqflags & PQ_WANTED) != 0) {
			pg->pqflags &= ~PQ_WANTED;
			wakeup(pg);
		}

		/*
		 * now remove the page from the queues.
		 */
		uvm_pagedequeue(pg);
		uvm_pageunlock(pg);
	} else {
		KASSERT(!uvmpdpol_pageisqueued_p(pg));
	}

	/*
	 * and put on free queue
	 */

#ifdef DEBUG
	pg->uobject = (void *)0xdeadbeef;
	pg->uanon = (void *)0xdeadbeef;
#endif /* DEBUG */

	/* Try to send the page to the per-CPU cache. */
	s = splvm();
	CPU_COUNT(CPU_COUNT_FREEPAGES, 1);
	ucpu = curcpu()->ci_data.cpu_uvm;
	bucket = uvm_page_get_bucket(pg);
	if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
		splx(s);
		return;
	}

	/* Didn't work.  Never mind, send it to a global bucket. */
	pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
	pgb = pgfl->pgfl_buckets[bucket];
	lock = &uvm_freelist_locks[bucket].lock;

	mutex_spin_enter(lock);
	/* PG_FREE must be set under lock because of uvm_pglistalloc(). */
	pg->flags = PG_FREE;
	LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
	pgb->pgb_nfree++;
	mutex_spin_exit(lock);
	splx(s);
}

/*
 * uvm_page_unbusy: unbusy an array of pages.
 *
 * => pages must either all belong to the same object, or all belong to anons.
 * => if pages are object-owned, object must be locked.
 * => if pages are anon-owned, anons must be locked.
 * => caller must make sure that anon-owned pages are not PG_RELEASED.
 */

void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
	struct vm_page *pg;
	int i;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);

	for (i = 0; i < npgs; i++) {
		pg = pgs[i];
		if (pg == NULL || pg == PGO_DONTCARE) {
			continue;
		}

		KASSERT(uvm_page_owner_locked_p(pg, true));
		KASSERT(pg->flags & PG_BUSY);
		KASSERT((pg->flags & PG_PAGEOUT) == 0);
		if (pg->flags & PG_RELEASED) {
			UVMHIST_LOG(ubchist, "releasing pg %#jx",
			    (uintptr_t)pg, 0, 0, 0);
			KASSERT(pg->uobject != NULL ||
			    (pg->uanon != NULL && pg->uanon->an_ref > 0));
			pg->flags &= ~PG_RELEASED;
			uvm_pagefree(pg);
		} else {
			UVMHIST_LOG(ubchist, "unbusying pg %#jx",
			    (uintptr_t)pg, 0, 0, 0);
			KASSERT((pg->flags & PG_FAKE) == 0);
			pg->flags &= ~PG_BUSY;
			uvm_pagelock(pg);
			uvm_pagewakeup(pg);
			uvm_pageunlock(pg);
			UVM_PAGE_OWN(pg, NULL);
		}
	}
}

/*
 * uvm_pagewait: wait for a busy page
 *
 * => page must be known PG_BUSY
 * => object must be read or write locked
 * => object will be unlocked on return
 */

void
uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
{

	KASSERT(rw_lock_held(lock));
	KASSERT((pg->flags & PG_BUSY) != 0);
	KASSERT(uvm_page_owner_locked_p(pg, false));

	mutex_enter(&pg->interlock);
	pg->pqflags |= PQ_WANTED;
	rw_exit(lock);
	UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
}

/*
 * uvm_pagewakeup: wake anyone waiting on a page
 *
 * => page interlock must be held
 */

void
uvm_pagewakeup(struct vm_page *pg)
{
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);

	KASSERT(mutex_owned(&pg->interlock));

	UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);

	if ((pg->pqflags & PQ_WANTED) != 0) {
		wakeup(pg);
		pg->pqflags &= ~PQ_WANTED;
	}
}

/*
 * uvm_pagewanted_p: return true if someone is waiting on the page
 *
 * => object must be write locked (lock out all concurrent access)
 */

bool
uvm_pagewanted_p(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, true));

	return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0;
}

#if defined(UVM_PAGE_TRKOWN)
/*
 * uvm_page_own: set or release page ownership
 *
 * => this is a debugging function that keeps track of who sets PG_BUSY
 *	and where they do it.   it can be used to track down problems
 *	such as a process setting "PG_BUSY" and never releasing it.
 * => page's object [if any] must be locked
 * => if "tag" is NULL then we are releasing page ownership
 */
void
uvm_page_own(struct vm_page *pg, const char *tag)
{

	KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
	KASSERT(uvm_page_owner_locked_p(pg, true));

	/* gain ownership? */
	if (tag) {
		KASSERT((pg->flags & PG_BUSY) != 0);
		if (pg->owner_tag) {
			printf("uvm_page_own: page %p already owned "
			    "by proc %d.%d [%s]\n", pg,
			    pg->owner, pg->lowner, pg->owner_tag);
			panic("uvm_page_own");
		}
		pg->owner = curproc->p_pid;
		pg->lowner = curlwp->l_lid;
		pg->owner_tag = tag;
		return;
	}

	/* drop ownership */
	KASSERT((pg->flags & PG_BUSY) == 0);
	if (pg->owner_tag == NULL) {
		printf("uvm_page_own: dropping ownership of a non-owned "
		    "page (%p)\n", pg);
		panic("uvm_page_own");
	}
	pg->owner_tag = NULL;
}
#endif

/*
 * uvm_pagelookup: look up a page
 *
 * => caller should lock object to keep someone from pulling the page
 *	out from under it
 */

struct vm_page *
uvm_pagelookup(struct uvm_object *obj, voff_t off)
{
	struct vm_page *pg;
	bool ddb __diagused = false;
#ifdef DDB
	extern int db_active;
	ddb = db_active != 0;
#endif

	KASSERT(ddb || rw_lock_held(obj->vmobjlock));

	pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);

	KASSERT(pg == NULL || obj->uo_npages != 0);
	KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
	    (pg->flags & PG_BUSY) != 0);
	return pg;
}

/*
 * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
 *
 * => caller must lock objects
 * => caller must hold pg->interlock
 */

void
uvm_pagewire(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, true));
	KASSERT(mutex_owned(&pg->interlock));
#if defined(READAHEAD_STATS)
	if ((pg->flags & PG_READAHEAD) != 0) {
		uvm_ra_hit.ev_count++;
		pg->flags &= ~PG_READAHEAD;
	}
#endif /* defined(READAHEAD_STATS) */
	if (pg->wire_count == 0) {
		uvm_pagedequeue(pg);
		atomic_inc_uint(&uvmexp.wired);
	}
	pg->wire_count++;
	KASSERT(pg->wire_count > 0);	/* detect wraparound */
}

/*
 * uvm_pageunwire: unwire the page.
 *
 * => activate if wire count goes to zero.
 * => caller must lock objects
 * => caller must hold pg->interlock
 */

void
uvm_pageunwire(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, true));
	KASSERT(pg->wire_count != 0);
	KASSERT(!uvmpdpol_pageisqueued_p(pg));
	KASSERT(mutex_owned(&pg->interlock));
	pg->wire_count--;
	if (pg->wire_count == 0) {
		uvm_pageactivate(pg);
		KASSERT(uvmexp.wired != 0);
		atomic_dec_uint(&uvmexp.wired);
	}
}

/*
 * uvm_pagedeactivate: deactivate page
 *
 * => caller must lock objects
 * => caller must check to make sure page is not wired
 * => object that page belongs to must be locked (so we can adjust pg->flags)
 * => caller must clear the reference on the page before calling
 * => caller must hold pg->interlock
 */

void
uvm_pagedeactivate(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, false));
	KASSERT(mutex_owned(&pg->interlock));
	if (pg->wire_count == 0) {
		KASSERT(uvmpdpol_pageisqueued_p(pg));
		uvmpdpol_pagedeactivate(pg);
	}
}

/*
 * uvm_pageactivate: activate page
 *
 * => caller must lock objects
 * => caller must hold pg->interlock
 */

void
uvm_pageactivate(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, false));
	KASSERT(mutex_owned(&pg->interlock));
#if defined(READAHEAD_STATS)
	if ((pg->flags & PG_READAHEAD) != 0) {
		uvm_ra_hit.ev_count++;
		pg->flags &= ~PG_READAHEAD;
	}
#endif /* defined(READAHEAD_STATS) */
	if (pg->wire_count == 0) {
		uvmpdpol_pageactivate(pg);
	}
}

/*
 * uvm_pagedequeue: remove a page from any paging queue
 *
 * => caller must lock objects
 * => caller must hold pg->interlock
 */
void
uvm_pagedequeue(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, true));
	KASSERT(mutex_owned(&pg->interlock));
	if (uvmpdpol_pageisqueued_p(pg)) {
		uvmpdpol_pagedequeue(pg);
	}
}

/*
 * uvm_pageenqueue: add a page to a paging queue without activating.
 * used where a page is not really demanded (yet).  eg. read-ahead
 *
 * => caller must lock objects
 * => caller must hold pg->interlock
 */
void
uvm_pageenqueue(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, false));
	KASSERT(mutex_owned(&pg->interlock));
	if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
		uvmpdpol_pageenqueue(pg);
	}
}

/*
 * uvm_pagelock: acquire page interlock
 */
void
uvm_pagelock(struct vm_page *pg)
{

	mutex_enter(&pg->interlock);
}

/*
 * uvm_pagelock2: acquire two page interlocks
 */
void
uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
{

	if (pg1 < pg2) {
		mutex_enter(&pg1->interlock);
		mutex_enter(&pg2->interlock);
	} else {
		mutex_enter(&pg2->interlock);
		mutex_enter(&pg1->interlock);
	}
}

/*
 * uvm_pageunlock: release page interlock, and if a page replacement intent
 * is set on the page, pass it to uvmpdpol to make real.
 *
 * => caller must hold pg->interlock
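 * => PQ_INTENT_QUEUED marks an intent that has already been handed to
 *    uvmpdpol, so each intent is realized at most once.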
 */
void
uvm_pageunlock(struct vm_page *pg)
{

	if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
	    (pg->pqflags & PQ_INTENT_QUEUED) != 0) {
		mutex_exit(&pg->interlock);
		return;
	}
	pg->pqflags |= PQ_INTENT_QUEUED;
	mutex_exit(&pg->interlock);
	uvmpdpol_pagerealize(pg);
}

/*
 * uvm_pageunlock2: release two page interlocks, and for both pages if a
 * page replacement intent is set on the page, pass it to uvmpdpol to make
 * real.
 *
 * => caller must hold pg->interlock
 */
void
uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
{

	if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
	    (pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
		mutex_exit(&pg1->interlock);
		pg1 = NULL;
	} else {
		pg1->pqflags |= PQ_INTENT_QUEUED;
		mutex_exit(&pg1->interlock);
	}

	if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
	    (pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
		mutex_exit(&pg2->interlock);
		pg2 = NULL;
	} else {
		pg2->pqflags |= PQ_INTENT_QUEUED;
		mutex_exit(&pg2->interlock);
	}

	if (pg1 != NULL) {
		uvmpdpol_pagerealize(pg1);
	}
	if (pg2 != NULL) {
		uvmpdpol_pagerealize(pg2);
	}
}

/*
 * uvm_pagezero: zero fill a page
 *
 * => if page is part of an object then the object should be locked
 *	to protect pg->flags.
 */

void
uvm_pagezero(struct vm_page *pg)
{

	uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
	pmap_zero_page(VM_PAGE_TO_PHYS(pg));
}

/*
 * uvm_pagecopy: copy a page
 *
 * => if page is part of an object then the object should be locked
 *	to protect pg->flags.
 */

void
uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
{

	uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
	pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
}

/*
 * uvm_pageismanaged: test to see whether a page (specified by PA) is managed.
 */

bool
uvm_pageismanaged(paddr_t pa)
{

	return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
}

/*
 * uvm_page_lookup_freelist: look up the free list for the specified page
 */

int
uvm_page_lookup_freelist(struct vm_page *pg)
{
	uvm_physseg_t upm;

	upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
	KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
	return uvm_physseg_get_free_list(upm);
}

/*
 * uvm_page_owner_locked_p: return true if object associated with page is
 * locked.  this is a weak check for runtime assertions only.
 */

bool
uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
{

	if (pg->uobject != NULL) {
		return exclusive
		    ? rw_write_held(pg->uobject->vmobjlock)
		    : rw_lock_held(pg->uobject->vmobjlock);
	}
	if (pg->uanon != NULL) {
		return exclusive

/*
 * uvm_pagezero: zero fill a page
 *
 * => if page is part of an object then the object should be locked
 *	to protect pg->flags.
 */

void
uvm_pagezero(struct vm_page *pg)
{

	uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
	pmap_zero_page(VM_PAGE_TO_PHYS(pg));
}

/*
 * uvm_pagecopy: copy a page
 *
 * => if page is part of an object then the object should be locked
 *	to protect pg->flags.
 */

void
uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
{

	uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
	pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
}

/*
 * uvm_pageismanaged: test to see whether a page (specified by PA) is managed.
 */

bool
uvm_pageismanaged(paddr_t pa)
{

	return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
}

/*
 * uvm_page_lookup_freelist: look up the free list for the specified page
 */

int
uvm_page_lookup_freelist(struct vm_page *pg)
{
	uvm_physseg_t upm;

	upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
	KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
	return uvm_physseg_get_free_list(upm);
}

/*
 * uvm_page_owner_locked_p: return true if object associated with page is
 * locked.  this is a weak check for runtime assertions only.
 */

bool
uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
{

	if (pg->uobject != NULL) {
		return exclusive
		    ? rw_write_held(pg->uobject->vmobjlock)
		    : rw_lock_held(pg->uobject->vmobjlock);
	}
	if (pg->uanon != NULL) {
		return exclusive
		    ? rw_write_held(pg->uanon->an_lock)
		    : rw_lock_held(pg->uanon->an_lock);
	}
	return true;
}

/*
 * uvm_pagereadonly_p: return true if the page should be mapped read-only
 */

bool
uvm_pagereadonly_p(struct vm_page *pg)
{
	struct uvm_object * const uobj = pg->uobject;

	KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
	KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
	if ((pg->flags & PG_RDONLY) != 0) {
		return true;
	}
	if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
		return true;
	}
	if (uobj == NULL) {
		return false;
	}
	return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
}

#ifdef PMAP_DIRECT
/*
 * Call pmap to translate each page's physical address into a virtual
 * address and run a callback on it.  Used to avoid actually mapping
 * the pages; the pmap most likely uses a direct map or equivalent.
 */
int
uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
    int (*process)(void *, size_t, void *), void *arg)
{
	int error = 0;
	paddr_t pa;
	size_t todo;
	voff_t pgoff = (off & PAGE_MASK);
	struct vm_page *pg;

	KASSERT(npages > 0 && len > 0);

	for (int i = 0; i < npages; i++) {
		pg = pgs[i];

		KASSERT(len > 0);

		/*
		 * Caller is responsible for ensuring all the pages are
		 * available.
		 */
		KASSERT(pg != NULL && pg != PGO_DONTCARE);

		pa = VM_PAGE_TO_PHYS(pg);
		todo = MIN(len, PAGE_SIZE - pgoff);

		error = pmap_direct_process(pa, pgoff, todo, process, arg);
		if (error)
			break;

		pgoff = 0;
		len -= todo;
	}

	KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
	return error;
}
#endif /* PMAP_DIRECT */
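
/*
 * Illustrative sketch: uvm_direct_process() above has the pmap invoke
 * the supplied callback on a directly-mapped window of each chunk,
 * passing the window address, the chunk length and the caller's
 * opaque argument.  A minimal callback, using hypothetical names,
 * could accumulate a byte sum over the pages as in the compiled-out
 * example below.
 */
#if 0
struct uvm_page_example_sum {
	uint64_t sum;
};

static int
uvm_page_example_sum_cb(void *kva, size_t len, void *cookie)
{
	struct uvm_page_example_sum *a = cookie;
	const uint8_t *p = kva;

	while (len-- > 0)
		a->sum += *p++;
	return 0;			/* non-zero aborts the walk */
}

/*
 * A caller would then do something like:
 *
 *	struct uvm_page_example_sum a = { .sum = 0 };
 *	error = uvm_direct_process(pgs, npages, off, len,
 *	    uvm_page_example_sum_cb, &a);
 */
#endif	/* illustrative sketch */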

#if defined(DDB) || defined(DEBUGPRINT)

/*
 * uvm_page_printit: actually print the page
 */

static const char page_flagbits[] = UVM_PGFLAGBITS;
static const char page_pqflagbits[] = UVM_PQFLAGBITS;

void
uvm_page_printit(struct vm_page *pg, bool full,
    void (*pr)(const char *, ...))
{
	struct vm_page *tpg;
	struct uvm_object *uobj;
	struct pgflbucket *pgb;
	struct pgflist *pgl;
	char pgbuf[128];

	(*pr)("PAGE %p:\n", pg);
	snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
	(*pr)(" flags=%s\n", pgbuf);
	snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
	(*pr)(" pqflags=%s\n", pgbuf);
	(*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n",
	    pg->uobject, pg->uanon, (long long)pg->offset);
	(*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
	    pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
	    uvm_page_get_freelist(pg));
	(*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
#if defined(UVM_PAGE_TRKOWN)
	if (pg->flags & PG_BUSY)
		(*pr)(" owning process = %d.%d, tag=%s\n",
		    pg->owner, pg->lowner, pg->owner_tag);
	else
		(*pr)(" page not busy, no owner\n");
#else
	(*pr)(" [page ownership tracking disabled]\n");
#endif

	if (!full)
		return;

	/* cross-verify object/anon */
	if ((pg->flags & PG_FREE) == 0) {
		if (pg->flags & PG_ANON) {
			if (pg->uanon == NULL || pg->uanon->an_page != pg)
				(*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
				    (pg->uanon) ? pg->uanon->an_page : NULL);
			else
				(*pr)(" anon backpointer is OK\n");
		} else {
			uobj = pg->uobject;
			if (uobj) {
				(*pr)(" checking object list\n");
				tpg = uvm_pagelookup(uobj, pg->offset);
				if (tpg)
					(*pr)(" page found on object list\n");
				else
					(*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
			}
		}
	}

	/* cross-verify page queue */
	if (pg->flags & PG_FREE) {
		int fl = uvm_page_get_freelist(pg);
		int b = uvm_page_get_bucket(pg);
		pgb = uvm.page_free[fl].pgfl_buckets[b];
		pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
		(*pr)(" checking pageq list\n");
		LIST_FOREACH(tpg, pgl, pageq.list) {
			if (tpg == pg) {
				break;
			}
		}
		if (tpg)
			(*pr)(" page found on pageq list\n");
		else
			(*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
	}
}

/*
 * uvm_page_printall - print a summary of all managed pages
 */

void
uvm_page_printall(void (*pr)(const char *, ...))
{
	uvm_physseg_t i;
	paddr_t pfn;
	struct vm_page *pg;

	(*pr)("%18s %4s %4s %18s %18s"
#ifdef UVM_PAGE_TRKOWN
	    " OWNER"
#endif
	    "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
	for (i = uvm_physseg_get_first();
	     uvm_physseg_valid_p(i);
	     i = uvm_physseg_get_next(i)) {
		for (pfn = uvm_physseg_get_start(i);
		     pfn < uvm_physseg_get_end(i);
		     pfn++) {
			pg = PHYS_TO_VM_PAGE(ptoa(pfn));

			(*pr)("%18p %04x %08x %18p %18p",
			    pg, pg->flags, pg->pqflags, pg->uobject,
			    pg->uanon);
#ifdef UVM_PAGE_TRKOWN
			if (pg->flags & PG_BUSY)
				(*pr)(" %d [%s]", pg->owner, pg->owner_tag);
#endif
			(*pr)("\n");
		}
	}
}

/*
 * uvm_page_print_freelists - print a summary of the freelists
 */

void
uvm_page_print_freelists(void (*pr)(const char *, ...))
{
	struct pgfreelist *pgfl;
	struct pgflbucket *pgb;
	int fl, b, c;

	(*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
	    VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);

	for (fl = 0; fl < VM_NFREELIST; fl++) {
		pgfl = &uvm.page_free[fl];
		(*pr)("freelist(%d) @ %p\n", fl, pgfl);
		for (b = 0; b < uvm.bucketcount; b++) {
			pgb = uvm.page_free[fl].pgfl_buckets[b];
			(*pr)(" bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
			    b, pgb, pgb->pgb_nfree,
			    &uvm_freelist_locks[b].lock);
			for (c = 0; c < uvmexp.ncolors; c++) {
				(*pr)(" color(%d) @ %p, ", c,
				    &pgb->pgb_colors[c]);
				(*pr)("first page = %p\n",
				    LIST_FIRST(&pgb->pgb_colors[c]));
			}
		}
	}
}

#endif /* DDB || DEBUGPRINT */