1 /* 2 * (MPSAFE) 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * The Mach Operating System project at Carnegie-Mellon University. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 39 * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $ 40 * $DragonFly: src/sys/vm/vm_page.c,v 1.40 2008/08/25 17:01:42 dillon Exp $ 41 */ 42 43 /* 44 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 45 * All rights reserved. 46 * 47 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 48 * 49 * Permission to use, copy, modify and distribute this software and 50 * its documentation is hereby granted, provided that both the copyright 51 * notice and this permission notice appear in all copies of the 52 * software, derivative works or modified versions, and any portions 53 * thereof, and that both notices appear in supporting documentation. 54 * 55 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 56 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 57 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 58 * 59 * Carnegie Mellon requests users of this software to return to 60 * 61 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 62 * School of Computer Science 63 * Carnegie Mellon University 64 * Pittsburgh PA 15213-3890 65 * 66 * any improvements or extensions that they make and grant Carnegie the 67 * rights to redistribute these changes. 68 */ 69 /* 70 * Resident memory management module. The module manipulates 'VM pages'. 71 * A VM page is the core building block for memory management. 
72 */ 73 74 #include <sys/param.h> 75 #include <sys/systm.h> 76 #include <sys/malloc.h> 77 #include <sys/proc.h> 78 #include <sys/vmmeter.h> 79 #include <sys/vnode.h> 80 #include <sys/kernel.h> 81 82 #include <vm/vm.h> 83 #include <vm/vm_param.h> 84 #include <sys/lock.h> 85 #include <vm/vm_kern.h> 86 #include <vm/pmap.h> 87 #include <vm/vm_map.h> 88 #include <vm/vm_object.h> 89 #include <vm/vm_page.h> 90 #include <vm/vm_pageout.h> 91 #include <vm/vm_pager.h> 92 #include <vm/vm_extern.h> 93 #include <vm/swap_pager.h> 94 95 #include <machine/md_var.h> 96 97 #include <vm/vm_page2.h> 98 #include <sys/mplock2.h> 99 100 #define VMACTION_HSIZE 256 101 #define VMACTION_HMASK (VMACTION_HSIZE - 1) 102 103 static void vm_page_queue_init(void); 104 static void vm_page_free_wakeup(void); 105 static vm_page_t vm_page_select_cache(vm_object_t, vm_pindex_t); 106 static vm_page_t _vm_page_list_find2(int basequeue, int index); 107 108 struct vpgqueues vm_page_queues[PQ_COUNT]; /* Array of tailq lists */ 109 110 LIST_HEAD(vm_page_action_list, vm_page_action); 111 struct vm_page_action_list action_list[VMACTION_HSIZE]; 112 static volatile int vm_pages_waiting; 113 114 115 #define ASSERT_IN_CRIT_SECTION() KKASSERT(crit_test(curthread)); 116 117 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare, 118 vm_pindex_t, pindex); 119 120 static void 121 vm_page_queue_init(void) 122 { 123 int i; 124 125 for (i = 0; i < PQ_L2_SIZE; i++) 126 vm_page_queues[PQ_FREE+i].cnt = &vmstats.v_free_count; 127 for (i = 0; i < PQ_L2_SIZE; i++) 128 vm_page_queues[PQ_CACHE+i].cnt = &vmstats.v_cache_count; 129 130 vm_page_queues[PQ_INACTIVE].cnt = &vmstats.v_inactive_count; 131 vm_page_queues[PQ_ACTIVE].cnt = &vmstats.v_active_count; 132 vm_page_queues[PQ_HOLD].cnt = &vmstats.v_active_count; 133 /* PQ_NONE has no queue */ 134 135 for (i = 0; i < PQ_COUNT; i++) 136 TAILQ_INIT(&vm_page_queues[i].pl); 137 138 for (i = 0; i < VMACTION_HSIZE; i++) 139 LIST_INIT(&action_list[i]); 140 } 141 142 /* 143 * note: place in initialized data section? Is this necessary? 144 */ 145 long first_page = 0; 146 int vm_page_array_size = 0; 147 int vm_page_zero_count = 0; 148 vm_page_t vm_page_array = 0; 149 150 /* 151 * (low level boot) 152 * 153 * Sets the page size, perhaps based upon the memory size. 154 * Must be called before any use of page-size dependent functions. 155 */ 156 void 157 vm_set_page_size(void) 158 { 159 if (vmstats.v_page_size == 0) 160 vmstats.v_page_size = PAGE_SIZE; 161 if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0) 162 panic("vm_set_page_size: page size not a power of two"); 163 } 164 165 /* 166 * (low level boot) 167 * 168 * Add a new page to the freelist for use by the system. New pages 169 * are added to both the head and tail of the associated free page 170 * queue in a bottom-up fashion, so both zero'd and non-zero'd page 171 * requests pull 'recent' adds (higher physical addresses) first. 172 * 173 * Must be called in a critical section. 
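 *
 * Illustrative note (not from the original source): with a 4096 byte
 * PAGE_SIZE, a page at physical address 0x12345000 has page frame number
 * 0x12345, so below it is colored with pc = (0x12345 & PQ_L2_MASK) and
 * linked onto the PQ_FREE + pc queue.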
174 */ 175 vm_page_t 176 vm_add_new_page(vm_paddr_t pa) 177 { 178 struct vpgqueues *vpq; 179 vm_page_t m; 180 181 ++vmstats.v_page_count; 182 ++vmstats.v_free_count; 183 m = PHYS_TO_VM_PAGE(pa); 184 m->phys_addr = pa; 185 m->flags = 0; 186 m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK; 187 m->queue = m->pc + PQ_FREE; 188 KKASSERT(m->dirty == 0); 189 190 vpq = &vm_page_queues[m->queue]; 191 if (vpq->flipflop) 192 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); 193 else 194 TAILQ_INSERT_HEAD(&vpq->pl, m, pageq); 195 vpq->flipflop = 1 - vpq->flipflop; 196 197 vm_page_queues[m->queue].lcnt++; 198 return (m); 199 } 200 201 /* 202 * (low level boot) 203 * 204 * Initializes the resident memory module. 205 * 206 * Allocates memory for the page cells, and for the object/offset-to-page 207 * hash table headers. Each page cell is initialized and placed on the 208 * free list. 209 * 210 * starta/enda represents the range of physical memory addresses available 211 * for use (skipping memory already used by the kernel), subject to 212 * phys_avail[]. Note that phys_avail[] has already mapped out memory 213 * already in use by the kernel. 214 */ 215 vm_offset_t 216 vm_page_startup(vm_offset_t vaddr) 217 { 218 vm_offset_t mapped; 219 vm_size_t npages; 220 vm_paddr_t page_range; 221 vm_paddr_t new_end; 222 int i; 223 vm_paddr_t pa; 224 int nblocks; 225 vm_paddr_t last_pa; 226 vm_paddr_t end; 227 vm_paddr_t biggestone, biggestsize; 228 vm_paddr_t total; 229 230 total = 0; 231 biggestsize = 0; 232 biggestone = 0; 233 nblocks = 0; 234 vaddr = round_page(vaddr); 235 236 for (i = 0; phys_avail[i + 1]; i += 2) { 237 phys_avail[i] = round_page64(phys_avail[i]); 238 phys_avail[i + 1] = trunc_page64(phys_avail[i + 1]); 239 } 240 241 for (i = 0; phys_avail[i + 1]; i += 2) { 242 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i]; 243 244 if (size > biggestsize) { 245 biggestone = i; 246 biggestsize = size; 247 } 248 ++nblocks; 249 total += size; 250 } 251 252 end = phys_avail[biggestone+1]; 253 end = trunc_page(end); 254 255 /* 256 * Initialize the queue headers for the free queue, the active queue 257 * and the inactive queue. 258 */ 259 260 vm_page_queue_init(); 261 262 /* VKERNELs don't support minidumps and as such don't need vm_page_dump */ 263 #if !defined(_KERNEL_VIRTUAL) 264 /* 265 * Allocate a bitmap to indicate that a random physical page 266 * needs to be included in a minidump. 267 * 268 * The amd64 port needs this to indicate which direct map pages 269 * need to be dumped, via calls to dump_add_page()/dump_drop_page(). 270 * 271 * However, i386 still needs this workspace internally within the 272 * minidump code. In theory, they are not needed on i386, but are 273 * included should the sf_buf code decide to use them. 274 */ 275 page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE; 276 vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); 277 end -= vm_page_dump_size; 278 vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size, 279 VM_PROT_READ | VM_PROT_WRITE); 280 bzero((void *)vm_page_dump, vm_page_dump_size); 281 #endif 282 283 /* 284 * Compute the number of pages of memory that will be available for 285 * use (taking into account the overhead of a page structure per 286 * page). 287 */ 288 first_page = phys_avail[0] / PAGE_SIZE; 289 page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page; 290 npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE; 291 292 /* 293 * Initialize the mem entry structures now, and put them in the free 294 * queue. 
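	 *
	 * Rough illustration (hypothetical numbers, not from this file): if
	 * managed physical memory tops out at 1GB, page_range is on the
	 * order of 262144 pages; assuming sizeof(struct vm_page) is roughly
	 * 100 bytes (illustrative only), about 25MB is carved off the top of
	 * the largest segment (new_end, below) to hold vm_page_array itself.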
295 */ 296 new_end = trunc_page(end - page_range * sizeof(struct vm_page)); 297 mapped = pmap_map(&vaddr, new_end, end, 298 VM_PROT_READ | VM_PROT_WRITE); 299 vm_page_array = (vm_page_t)mapped; 300 301 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL) 302 /* 303 * since pmap_map on amd64 returns stuff out of a direct-map region, 304 * we have to manually add these pages to the minidump tracking so 305 * that they can be dumped, including the vm_page_array. 306 */ 307 for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE) 308 dump_add_page(pa); 309 #endif 310 311 /* 312 * Clear all of the page structures 313 */ 314 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); 315 vm_page_array_size = page_range; 316 317 /* 318 * Construct the free queue(s) in ascending order (by physical 319 * address) so that the first 16MB of physical memory is allocated 320 * last rather than first. On large-memory machines, this avoids 321 * the exhaustion of low physical memory before isa_dmainit has run. 322 */ 323 vmstats.v_page_count = 0; 324 vmstats.v_free_count = 0; 325 for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) { 326 pa = phys_avail[i]; 327 if (i == biggestone) 328 last_pa = new_end; 329 else 330 last_pa = phys_avail[i + 1]; 331 while (pa < last_pa && npages-- > 0) { 332 vm_add_new_page(pa); 333 pa += PAGE_SIZE; 334 } 335 } 336 return (vaddr); 337 } 338 339 /* 340 * Scan comparison function for Red-Black tree scans. An inclusive 341 * (start,end) is expected. Other fields are not used. 342 */ 343 int 344 rb_vm_page_scancmp(struct vm_page *p, void *data) 345 { 346 struct rb_vm_page_scan_info *info = data; 347 348 if (p->pindex < info->start_pindex) 349 return(-1); 350 if (p->pindex > info->end_pindex) 351 return(1); 352 return(0); 353 } 354 355 int 356 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2) 357 { 358 if (p1->pindex < p2->pindex) 359 return(-1); 360 if (p1->pindex > p2->pindex) 361 return(1); 362 return(0); 363 } 364 365 /* 366 * Holding a page keeps it from being reused. Other parts of the system 367 * can still disassociate the page from its current object and free it, or 368 * perform read or write I/O on it and/or otherwise manipulate the page, 369 * but if the page is held the VM system will leave the page and its data 370 * intact and not reuse the page for other purposes until the last hold 371 * reference is released. (see vm_page_wire() if you want to prevent the 372 * page from being disassociated from its object too). 373 * 374 * The caller must hold vm_token. 375 * 376 * The caller must still validate the contents of the page and, if necessary, 377 * wait for any pending I/O (e.g. vm_page_sleep_busy() loop) to complete 378 * before manipulating the page. 379 */ 380 void 381 vm_page_hold(vm_page_t m) 382 { 383 ASSERT_LWKT_TOKEN_HELD(&vm_token); 384 ++m->hold_count; 385 } 386 387 /* 388 * The opposite of vm_page_hold(). A page can be freed while being held, 389 * which places it on the PQ_HOLD queue. We must call vm_page_free_toq() 390 * in this case to actually free it once the hold count drops to 0. 391 * 392 * The caller must hold vm_token if non-blocking operation is desired, 393 * but otherwise does not need to. 
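 *
 * Illustrative usage sketch (not taken from this file; 'obj' and 'idx'
 * are hypothetical):
 *
 *	lwkt_gettoken(&vm_token);
 *	m = vm_page_lookup(obj, idx);
 *	if (m)
 *		vm_page_hold(m);
 *	lwkt_reltoken(&vm_token);
 *	... examine the page data, it will not be reused out from under us ...
 *	if (m)
 *		vm_page_unhold(m);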
 */
void
vm_page_unhold(vm_page_t m)
{
	lwkt_gettoken(&vm_token);
	--m->hold_count;
	KASSERT(m->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
	if (m->hold_count == 0 && m->queue == PQ_HOLD) {
		vm_page_busy(m);
		vm_page_free_toq(m);
	}
	lwkt_reltoken(&vm_token);
}

/*
 * Inserts the given vm_page into the object and object list.
 *
 * The pagetables are not updated but will presumably fault the page
 * in if necessary, or if a kernel page the caller will at some point
 * enter the page into the kernel's pmap.  We are not allowed to block
 * here so we *can't* do this anyway.
 *
 * This routine may not block.
 * This routine must be called with the vm_token held.
 * This routine must be called with a critical section held.
 */
void
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
	ASSERT_IN_CRIT_SECTION();
	ASSERT_LWKT_TOKEN_HELD(&vm_token);
	if (m->object != NULL)
		panic("vm_page_insert: already inserted");

	/*
	 * Record the object/offset pair in this page
	 */
	m->object = object;
	m->pindex = pindex;

	/*
	 * Insert it into the object.
	 */
	vm_page_rb_tree_RB_INSERT(&object->rb_memq, m);
	object->generation++;

	/*
	 * Show that the object has one more resident page.
	 */
	object->resident_page_count++;

	/*
	 * Add the page's pv_list_count to the object's aggregate count
	 * when the page is inserted into the object.
	 */
	object->agg_pv_list_count = object->agg_pv_list_count + m->md.pv_list_count;

	/*
	 * Since we are inserting a new and possibly dirty page,
	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
	 */
	if ((m->valid & m->dirty) || (m->flags & PG_WRITEABLE))
		vm_object_set_writeable_dirty(object);

	/*
	 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
	 */
	swap_pager_page_inserted(m);
}

/*
 * Removes the given vm_page_t from the global (object,index) hash table
 * and from the object's memq.
 *
 * The underlying pmap entry (if any) is NOT removed here.
 * This routine may not block.
 *
 * The page must be BUSY and will remain BUSY on return.
 * No other requirements.
 *
 * NOTE: The FreeBSD side effect was to unbusy the page on return.  We
 *	 leave it busy.
 */
void
vm_page_remove(vm_page_t m)
{
	vm_object_t object;

	crit_enter();
	lwkt_gettoken(&vm_token);
	if (m->object == NULL) {
		lwkt_reltoken(&vm_token);
		crit_exit();
		return;
	}

	if ((m->flags & PG_BUSY) == 0)
		panic("vm_page_remove: page not busy");

	object = m->object;

	/*
	 * Remove the page from the object and update the object.
	 */
	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
	object->resident_page_count--;
	object->agg_pv_list_count = object->agg_pv_list_count - m->md.pv_list_count;
	object->generation++;
	m->object = NULL;

	lwkt_reltoken(&vm_token);
	crit_exit();
}

/*
 * Locate and return the page at (object, pindex), or NULL if the
 * page could not be found.
 *
 * The caller must hold vm_token.
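 *
 * A typical lookup-and-busy pattern looks like the sketch below (this is
 * only an illustration, mirroring what vm_page_grab() does later in this
 * file; 'object' and 'pindex' are the caller's):
 *
 *	lwkt_gettoken(&vm_token);
 *	while ((m = vm_page_lookup(object, pindex)) != NULL &&
 *	       (m->busy || (m->flags & PG_BUSY))) {
 *		vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
 *		tsleep(m, 0, "pgrbwt", 0);
 *	}
 *	if (m)
 *		vm_page_busy(m);
 *	lwkt_reltoken(&vm_token);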
513 */ 514 vm_page_t 515 vm_page_lookup(vm_object_t object, vm_pindex_t pindex) 516 { 517 vm_page_t m; 518 519 /* 520 * Search the hash table for this object/offset pair 521 */ 522 ASSERT_LWKT_TOKEN_HELD(&vm_token); 523 crit_enter(); 524 m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex); 525 crit_exit(); 526 KKASSERT(m == NULL || (m->object == object && m->pindex == pindex)); 527 return(m); 528 } 529 530 /* 531 * vm_page_rename() 532 * 533 * Move the given memory entry from its current object to the specified 534 * target object/offset. 535 * 536 * The object must be locked. 537 * This routine may not block. 538 * 539 * Note: This routine will raise itself to splvm(), the caller need not. 540 * 541 * Note: Swap associated with the page must be invalidated by the move. We 542 * have to do this for several reasons: (1) we aren't freeing the 543 * page, (2) we are dirtying the page, (3) the VM system is probably 544 * moving the page from object A to B, and will then later move 545 * the backing store from A to B and we can't have a conflict. 546 * 547 * Note: We *always* dirty the page. It is necessary both for the 548 * fact that we moved it, and because we may be invalidating 549 * swap. If the page is on the cache, we have to deactivate it 550 * or vm_page_dirty() will panic. Dirty pages are not allowed 551 * on the cache. 552 */ 553 void 554 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) 555 { 556 crit_enter(); 557 lwkt_gettoken(&vm_token); 558 vm_page_remove(m); 559 vm_page_insert(m, new_object, new_pindex); 560 if (m->queue - m->pc == PQ_CACHE) 561 vm_page_deactivate(m); 562 vm_page_dirty(m); 563 vm_page_wakeup(m); 564 lwkt_reltoken(&vm_token); 565 crit_exit(); 566 } 567 568 /* 569 * vm_page_unqueue() without any wakeup. This routine is used when a page 570 * is being moved between queues or otherwise is to remain BUSYied by the 571 * caller. 572 * 573 * The caller must hold vm_token 574 * This routine may not block. 575 */ 576 void 577 vm_page_unqueue_nowakeup(vm_page_t m) 578 { 579 int queue = m->queue; 580 struct vpgqueues *pq; 581 582 ASSERT_LWKT_TOKEN_HELD(&vm_token); 583 if (queue != PQ_NONE) { 584 pq = &vm_page_queues[queue]; 585 m->queue = PQ_NONE; 586 TAILQ_REMOVE(&pq->pl, m, pageq); 587 (*pq->cnt)--; 588 pq->lcnt--; 589 } 590 } 591 592 /* 593 * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedemon 594 * if necessary. 595 * 596 * The caller must hold vm_token 597 * This routine may not block. 598 */ 599 void 600 vm_page_unqueue(vm_page_t m) 601 { 602 int queue = m->queue; 603 struct vpgqueues *pq; 604 605 ASSERT_LWKT_TOKEN_HELD(&vm_token); 606 if (queue != PQ_NONE) { 607 m->queue = PQ_NONE; 608 pq = &vm_page_queues[queue]; 609 TAILQ_REMOVE(&pq->pl, m, pageq); 610 (*pq->cnt)--; 611 pq->lcnt--; 612 if ((queue - m->pc) == PQ_CACHE || (queue - m->pc) == PQ_FREE) 613 pagedaemon_wakeup(); 614 } 615 } 616 617 /* 618 * vm_page_list_find() 619 * 620 * Find a page on the specified queue with color optimization. 621 * 622 * The page coloring optimization attempts to locate a page that does 623 * not overload other nearby pages in the object in the cpu's L1 or L2 624 * caches. We need this optimization because cpu caches tend to be 625 * physical caches, while object spaces tend to be virtual. 626 * 627 * Must be called with vm_token held. 628 * This routine may not block. 629 * 630 * Note that this routine is carefully inlined. 
A non-inlined version 631 * is available for outside callers but the only critical path is 632 * from within this source file. 633 */ 634 static __inline 635 vm_page_t 636 _vm_page_list_find(int basequeue, int index, boolean_t prefer_zero) 637 { 638 vm_page_t m; 639 640 if (prefer_zero) 641 m = TAILQ_LAST(&vm_page_queues[basequeue+index].pl, pglist); 642 else 643 m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl); 644 if (m == NULL) 645 m = _vm_page_list_find2(basequeue, index); 646 return(m); 647 } 648 649 static vm_page_t 650 _vm_page_list_find2(int basequeue, int index) 651 { 652 int i; 653 vm_page_t m = NULL; 654 struct vpgqueues *pq; 655 656 pq = &vm_page_queues[basequeue]; 657 658 /* 659 * Note that for the first loop, index+i and index-i wind up at the 660 * same place. Even though this is not totally optimal, we've already 661 * blown it by missing the cache case so we do not care. 662 */ 663 664 for(i = PQ_L2_SIZE / 2; i > 0; --i) { 665 if ((m = TAILQ_FIRST(&pq[(index + i) & PQ_L2_MASK].pl)) != NULL) 666 break; 667 668 if ((m = TAILQ_FIRST(&pq[(index - i) & PQ_L2_MASK].pl)) != NULL) 669 break; 670 } 671 return(m); 672 } 673 674 /* 675 * Must be called with vm_token held if the caller desired non-blocking 676 * operation and a stable result. 677 */ 678 vm_page_t 679 vm_page_list_find(int basequeue, int index, boolean_t prefer_zero) 680 { 681 return(_vm_page_list_find(basequeue, index, prefer_zero)); 682 } 683 684 /* 685 * Find a page on the cache queue with color optimization. As pages 686 * might be found, but not applicable, they are deactivated. This 687 * keeps us from using potentially busy cached pages. 688 * 689 * This routine may not block. 690 * Must be called with vm_token held. 691 */ 692 vm_page_t 693 vm_page_select_cache(vm_object_t object, vm_pindex_t pindex) 694 { 695 vm_page_t m; 696 697 ASSERT_LWKT_TOKEN_HELD(&vm_token); 698 while (TRUE) { 699 m = _vm_page_list_find( 700 PQ_CACHE, 701 (pindex + object->pg_color) & PQ_L2_MASK, 702 FALSE 703 ); 704 if (m && ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy || 705 m->hold_count || m->wire_count)) { 706 vm_page_deactivate(m); 707 continue; 708 } 709 return m; 710 } 711 /* not reached */ 712 } 713 714 /* 715 * Find a free or zero page, with specified preference. We attempt to 716 * inline the nominal case and fall back to _vm_page_select_free() 717 * otherwise. 718 * 719 * This routine must be called with a critical section held. 720 * This routine may not block. 721 */ 722 static __inline vm_page_t 723 vm_page_select_free(vm_object_t object, vm_pindex_t pindex, boolean_t prefer_zero) 724 { 725 vm_page_t m; 726 727 m = _vm_page_list_find( 728 PQ_FREE, 729 (pindex + object->pg_color) & PQ_L2_MASK, 730 prefer_zero 731 ); 732 return(m); 733 } 734 735 /* 736 * vm_page_alloc() 737 * 738 * Allocate and return a memory cell associated with this VM object/offset 739 * pair. 740 * 741 * page_req classes: 742 * 743 * VM_ALLOC_NORMAL allow use of cache pages, nominal free drain 744 * VM_ALLOC_QUICK like normal but cannot use cache 745 * VM_ALLOC_SYSTEM greater free drain 746 * VM_ALLOC_INTERRUPT allow free list to be completely drained 747 * VM_ALLOC_ZERO advisory request for pre-zero'd page 748 * 749 * The object must be locked. 750 * This routine may not block. 751 * The returned page will be marked PG_BUSY 752 * 753 * Additional special handling is required when called from an interrupt 754 * (VM_ALLOC_INTERRUPT). We are not allowed to mess with the page cache 755 * in this case. 
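 *
 * Typical call pattern (sketch only; 'object' and 'pindex' are the
 * caller's, and the wait-and-retry mirrors vm_page_grab() below):
 *
 *	m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_ZERO);
 *	if (m == NULL) {
 *		vm_wait(0);
 *		... retry the allocation or back out ...
 *	}
 *	... initialize the page ...
 *	vm_page_wakeup(m);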
756 */ 757 vm_page_t 758 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req) 759 { 760 vm_page_t m = NULL; 761 762 crit_enter(); 763 lwkt_gettoken(&vm_token); 764 765 KKASSERT(object != NULL); 766 KASSERT(!vm_page_lookup(object, pindex), 767 ("vm_page_alloc: page already allocated")); 768 KKASSERT(page_req & 769 (VM_ALLOC_NORMAL|VM_ALLOC_QUICK| 770 VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM)); 771 772 /* 773 * Certain system threads (pageout daemon, buf_daemon's) are 774 * allowed to eat deeper into the free page list. 775 */ 776 if (curthread->td_flags & TDF_SYSTHREAD) 777 page_req |= VM_ALLOC_SYSTEM; 778 779 loop: 780 if (vmstats.v_free_count > vmstats.v_free_reserved || 781 ((page_req & VM_ALLOC_INTERRUPT) && vmstats.v_free_count > 0) || 782 ((page_req & VM_ALLOC_SYSTEM) && vmstats.v_cache_count == 0 && 783 vmstats.v_free_count > vmstats.v_interrupt_free_min) 784 ) { 785 /* 786 * The free queue has sufficient free pages to take one out. 787 */ 788 if (page_req & VM_ALLOC_ZERO) 789 m = vm_page_select_free(object, pindex, TRUE); 790 else 791 m = vm_page_select_free(object, pindex, FALSE); 792 } else if (page_req & VM_ALLOC_NORMAL) { 793 /* 794 * Allocatable from the cache (non-interrupt only). On 795 * success, we must free the page and try again, thus 796 * ensuring that vmstats.v_*_free_min counters are replenished. 797 */ 798 #ifdef INVARIANTS 799 if (curthread->td_preempted) { 800 kprintf("vm_page_alloc(): warning, attempt to allocate" 801 " cache page from preempting interrupt\n"); 802 m = NULL; 803 } else { 804 m = vm_page_select_cache(object, pindex); 805 } 806 #else 807 m = vm_page_select_cache(object, pindex); 808 #endif 809 /* 810 * On success move the page into the free queue and loop. 811 */ 812 if (m != NULL) { 813 KASSERT(m->dirty == 0, 814 ("Found dirty cache page %p", m)); 815 vm_page_busy(m); 816 vm_page_protect(m, VM_PROT_NONE); 817 vm_page_free(m); 818 goto loop; 819 } 820 821 /* 822 * On failure return NULL 823 */ 824 lwkt_reltoken(&vm_token); 825 crit_exit(); 826 #if defined(DIAGNOSTIC) 827 if (vmstats.v_cache_count > 0) 828 kprintf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", vmstats.v_cache_count); 829 #endif 830 vm_pageout_deficit++; 831 pagedaemon_wakeup(); 832 return (NULL); 833 } else { 834 /* 835 * No pages available, wakeup the pageout daemon and give up. 836 */ 837 lwkt_reltoken(&vm_token); 838 crit_exit(); 839 vm_pageout_deficit++; 840 pagedaemon_wakeup(); 841 return (NULL); 842 } 843 844 /* 845 * Good page found. The page has not yet been busied. We are in 846 * a critical section. 847 */ 848 KASSERT(m != NULL, ("vm_page_alloc(): missing page on free queue\n")); 849 KASSERT(m->dirty == 0, 850 ("vm_page_alloc: free/cache page %p was dirty", m)); 851 852 /* 853 * Remove from free queue 854 */ 855 vm_page_unqueue_nowakeup(m); 856 857 /* 858 * Initialize structure. Only the PG_ZERO flag is inherited. Set 859 * the page PG_BUSY 860 */ 861 if (m->flags & PG_ZERO) { 862 vm_page_zero_count--; 863 m->flags = PG_ZERO | PG_BUSY; 864 } else { 865 m->flags = PG_BUSY; 866 } 867 m->wire_count = 0; 868 m->hold_count = 0; 869 m->act_count = 0; 870 m->busy = 0; 871 m->valid = 0; 872 873 /* 874 * vm_page_insert() is safe prior to the crit_exit(). Note also that 875 * inserting a page here does not insert it into the pmap (which 876 * could cause us to block allocating memory). We cannot block 877 * anywhere. 
878 */ 879 vm_page_insert(m, object, pindex); 880 881 /* 882 * Don't wakeup too often - wakeup the pageout daemon when 883 * we would be nearly out of memory. 884 */ 885 pagedaemon_wakeup(); 886 887 lwkt_reltoken(&vm_token); 888 crit_exit(); 889 890 /* 891 * A PG_BUSY page is returned. 892 */ 893 return (m); 894 } 895 896 /* 897 * Wait for sufficient free memory for nominal heavy memory use kernel 898 * operations. 899 */ 900 void 901 vm_wait_nominal(void) 902 { 903 while (vm_page_count_min(0)) 904 vm_wait(0); 905 } 906 907 /* 908 * Test if vm_wait_nominal() would block. 909 */ 910 int 911 vm_test_nominal(void) 912 { 913 if (vm_page_count_min(0)) 914 return(1); 915 return(0); 916 } 917 918 /* 919 * Block until free pages are available for allocation, called in various 920 * places before memory allocations. 921 * 922 * The caller may loop if vm_page_count_min() == FALSE so we cannot be 923 * more generous then that. 924 */ 925 void 926 vm_wait(int timo) 927 { 928 /* 929 * never wait forever 930 */ 931 if (timo == 0) 932 timo = hz; 933 lwkt_gettoken(&vm_token); 934 935 if (curthread == pagethread) { 936 /* 937 * The pageout daemon itself needs pages, this is bad. 938 */ 939 if (vm_page_count_min(0)) { 940 vm_pageout_pages_needed = 1; 941 tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo); 942 } 943 } else { 944 /* 945 * Wakeup the pageout daemon if necessary and wait. 946 */ 947 if (vm_page_count_target()) { 948 if (vm_pages_needed == 0) { 949 vm_pages_needed = 1; 950 wakeup(&vm_pages_needed); 951 } 952 ++vm_pages_waiting; /* SMP race ok */ 953 tsleep(&vmstats.v_free_count, 0, "vmwait", timo); 954 } 955 } 956 lwkt_reltoken(&vm_token); 957 } 958 959 /* 960 * Block until free pages are available for allocation 961 * 962 * Called only from vm_fault so that processes page faulting can be 963 * easily tracked. 964 */ 965 void 966 vm_waitpfault(void) 967 { 968 /* 969 * Wakeup the pageout daemon if necessary and wait. 970 */ 971 if (vm_page_count_target()) { 972 lwkt_gettoken(&vm_token); 973 if (vm_page_count_target()) { 974 if (vm_pages_needed == 0) { 975 vm_pages_needed = 1; 976 wakeup(&vm_pages_needed); 977 } 978 ++vm_pages_waiting; /* SMP race ok */ 979 tsleep(&vmstats.v_free_count, 0, "pfault", hz); 980 } 981 lwkt_reltoken(&vm_token); 982 } 983 } 984 985 /* 986 * Put the specified page on the active list (if appropriate). Ensure 987 * that act_count is at least ACT_INIT but do not otherwise mess with it. 988 * 989 * The page queues must be locked. 990 * This routine may not block. 991 */ 992 void 993 vm_page_activate(vm_page_t m) 994 { 995 crit_enter(); 996 lwkt_gettoken(&vm_token); 997 if (m->queue != PQ_ACTIVE) { 998 if ((m->queue - m->pc) == PQ_CACHE) 999 mycpu->gd_cnt.v_reactivated++; 1000 1001 vm_page_unqueue(m); 1002 1003 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { 1004 m->queue = PQ_ACTIVE; 1005 vm_page_queues[PQ_ACTIVE].lcnt++; 1006 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, 1007 m, pageq); 1008 if (m->act_count < ACT_INIT) 1009 m->act_count = ACT_INIT; 1010 vmstats.v_active_count++; 1011 } 1012 } else { 1013 if (m->act_count < ACT_INIT) 1014 m->act_count = ACT_INIT; 1015 } 1016 lwkt_reltoken(&vm_token); 1017 crit_exit(); 1018 } 1019 1020 /* 1021 * Helper routine for vm_page_free_toq() and vm_page_cache(). This 1022 * routine is called when a page has been added to the cache or free 1023 * queues. 1024 * 1025 * This routine may not block. 
 * This routine must be called at splvm().
 */
static __inline void
vm_page_free_wakeup(void)
{
	/*
	 * If the pageout daemon itself needs pages, then tell it that
	 * there are some free.
	 */
	if (vm_pageout_pages_needed &&
	    vmstats.v_cache_count + vmstats.v_free_count >=
	    vmstats.v_pageout_free_min
	) {
		wakeup(&vm_pageout_pages_needed);
		vm_pageout_pages_needed = 0;
	}

	/*
	 * Wakeup processes that are waiting on memory.
	 *
	 * NOTE: vm_paging_target() is the pageout daemon's target, while
	 *	 vm_page_count_target() is somewhere in between.  We want
	 *	 to wake processes up prior to the pageout daemon reaching
	 *	 its target to provide some hysteresis.
	 */
	if (vm_pages_waiting) {
		if (!vm_page_count_target()) {
			/*
			 * Plenty of pages are free, wakeup everyone.
			 */
			vm_pages_waiting = 0;
			wakeup(&vmstats.v_free_count);
			++mycpu->gd_cnt.v_ppwakeups;
		} else if (!vm_page_count_min(0)) {
			/*
			 * Some pages are free, wakeup someone.
			 */
			int wcount = vm_pages_waiting;
			if (wcount > 0)
				--wcount;
			vm_pages_waiting = wcount;
			wakeup_one(&vmstats.v_free_count);
			++mycpu->gd_cnt.v_ppwakeups;
		}
	}
}

/*
 * vm_page_free_toq:
 *
 * Returns the given page to the PQ_FREE list, disassociating it from
 * any VM object.
 *
 * The vm_page must be PG_BUSY on entry.  PG_BUSY will be released on
 * return (the page will have been freed).  No particular spl is required
 * on entry.
 *
 * This routine may not block.
 */
void
vm_page_free_toq(vm_page_t m)
{
	struct vpgqueues *pq;

	crit_enter();
	lwkt_gettoken(&vm_token);
	mycpu->gd_cnt.v_tfree++;

	KKASSERT((m->flags & PG_MAPPED) == 0);

	if (m->busy || ((m->queue - m->pc) == PQ_FREE)) {
		kprintf(
		    "vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n",
		    (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0,
		    m->hold_count);
		if ((m->queue - m->pc) == PQ_FREE)
			panic("vm_page_free: freeing free page");
		else
			panic("vm_page_free: freeing busy page");
	}

	/*
	 * unqueue, then remove page.  Note that we cannot destroy
	 * the page here because we do not want to call the pager's
	 * callback routine until after we've put the page on the
	 * appropriate free queue.
	 */
	vm_page_unqueue_nowakeup(m);
	vm_page_remove(m);

	/*
	 * No further management of fictitious pages occurs beyond object
	 * and queue removal.
	 */
	if ((m->flags & PG_FICTITIOUS) != 0) {
		vm_page_wakeup(m);
		lwkt_reltoken(&vm_token);
		crit_exit();
		return;
	}

	m->valid = 0;
	vm_page_undirty(m);

	if (m->wire_count != 0) {
		if (m->wire_count > 1) {
			panic(
			    "vm_page_free: invalid wire count (%d), pindex: 0x%lx",
			    m->wire_count, (long)m->pindex);
		}
		panic("vm_page_free: freeing wired page");
	}

	/*
	 * Clear the UNMANAGED flag when freeing an unmanaged page.
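	 *
	 * If the page is still held it is parked on PQ_HOLD rather than
	 * PQ_FREE; vm_page_unhold() re-frees it when the last hold
	 * reference is released.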
1141 */ 1142 if (m->flags & PG_UNMANAGED) { 1143 m->flags &= ~PG_UNMANAGED; 1144 } 1145 1146 if (m->hold_count != 0) { 1147 m->flags &= ~PG_ZERO; 1148 m->queue = PQ_HOLD; 1149 } else { 1150 m->queue = PQ_FREE + m->pc; 1151 } 1152 pq = &vm_page_queues[m->queue]; 1153 pq->lcnt++; 1154 ++(*pq->cnt); 1155 1156 /* 1157 * Put zero'd pages on the end ( where we look for zero'd pages 1158 * first ) and non-zerod pages at the head. 1159 */ 1160 if (m->flags & PG_ZERO) { 1161 TAILQ_INSERT_TAIL(&pq->pl, m, pageq); 1162 ++vm_page_zero_count; 1163 } else { 1164 TAILQ_INSERT_HEAD(&pq->pl, m, pageq); 1165 } 1166 vm_page_wakeup(m); 1167 vm_page_free_wakeup(); 1168 lwkt_reltoken(&vm_token); 1169 crit_exit(); 1170 } 1171 1172 /* 1173 * vm_page_free_fromq_fast() 1174 * 1175 * Remove a non-zero page from one of the free queues; the page is removed for 1176 * zeroing, so do not issue a wakeup. 1177 * 1178 * MPUNSAFE 1179 */ 1180 vm_page_t 1181 vm_page_free_fromq_fast(void) 1182 { 1183 static int qi; 1184 vm_page_t m; 1185 int i; 1186 1187 crit_enter(); 1188 lwkt_gettoken(&vm_token); 1189 for (i = 0; i < PQ_L2_SIZE; ++i) { 1190 m = vm_page_list_find(PQ_FREE, qi, FALSE); 1191 qi = (qi + PQ_PRIME2) & PQ_L2_MASK; 1192 if (m && (m->flags & PG_ZERO) == 0) { 1193 vm_page_unqueue_nowakeup(m); 1194 vm_page_busy(m); 1195 break; 1196 } 1197 m = NULL; 1198 } 1199 lwkt_reltoken(&vm_token); 1200 crit_exit(); 1201 return (m); 1202 } 1203 1204 /* 1205 * vm_page_unmanage() 1206 * 1207 * Prevent PV management from being done on the page. The page is 1208 * removed from the paging queues as if it were wired, and as a 1209 * consequence of no longer being managed the pageout daemon will not 1210 * touch it (since there is no way to locate the pte mappings for the 1211 * page). madvise() calls that mess with the pmap will also no longer 1212 * operate on the page. 1213 * 1214 * Beyond that the page is still reasonably 'normal'. Freeing the page 1215 * will clear the flag. 1216 * 1217 * This routine is used by OBJT_PHYS objects - objects using unswappable 1218 * physical memory as backing store rather then swap-backed memory and 1219 * will eventually be extended to support 4MB unmanaged physical 1220 * mappings. 1221 * 1222 * Must be called with a critical section held. 1223 * Must be called with vm_token held. 1224 */ 1225 void 1226 vm_page_unmanage(vm_page_t m) 1227 { 1228 ASSERT_IN_CRIT_SECTION(); 1229 ASSERT_LWKT_TOKEN_HELD(&vm_token); 1230 if ((m->flags & PG_UNMANAGED) == 0) { 1231 if (m->wire_count == 0) 1232 vm_page_unqueue(m); 1233 } 1234 vm_page_flag_set(m, PG_UNMANAGED); 1235 } 1236 1237 /* 1238 * Mark this page as wired down by yet another map, removing it from 1239 * paging queues as necessary. 1240 * 1241 * The page queues must be locked. 1242 * This routine may not block. 1243 */ 1244 void 1245 vm_page_wire(vm_page_t m) 1246 { 1247 /* 1248 * Only bump the wire statistics if the page is not already wired, 1249 * and only unqueue the page if it is on some queue (if it is unmanaged 1250 * it is already off the queues). Don't do anything with fictitious 1251 * pages because they are always wired. 
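	 *
	 * Sketch of a typical wire/unwire pairing (illustrative only; the
	 * page is assumed to already be busied by the caller):
	 *
	 *	vm_page_wire(m);
	 *	vm_page_wakeup(m);
	 *	... long-term kernel use of the page ...
	 *	vm_page_unwire(m, 1);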
1252 */ 1253 crit_enter(); 1254 lwkt_gettoken(&vm_token); 1255 if ((m->flags & PG_FICTITIOUS) == 0) { 1256 if (m->wire_count == 0) { 1257 if ((m->flags & PG_UNMANAGED) == 0) 1258 vm_page_unqueue(m); 1259 vmstats.v_wire_count++; 1260 } 1261 m->wire_count++; 1262 KASSERT(m->wire_count != 0, 1263 ("vm_page_wire: wire_count overflow m=%p", m)); 1264 } 1265 lwkt_reltoken(&vm_token); 1266 crit_exit(); 1267 } 1268 1269 /* 1270 * Release one wiring of this page, potentially enabling it to be paged again. 1271 * 1272 * Many pages placed on the inactive queue should actually go 1273 * into the cache, but it is difficult to figure out which. What 1274 * we do instead, if the inactive target is well met, is to put 1275 * clean pages at the head of the inactive queue instead of the tail. 1276 * This will cause them to be moved to the cache more quickly and 1277 * if not actively re-referenced, freed more quickly. If we just 1278 * stick these pages at the end of the inactive queue, heavy filesystem 1279 * meta-data accesses can cause an unnecessary paging load on memory bound 1280 * processes. This optimization causes one-time-use metadata to be 1281 * reused more quickly. 1282 * 1283 * BUT, if we are in a low-memory situation we have no choice but to 1284 * put clean pages on the cache queue. 1285 * 1286 * A number of routines use vm_page_unwire() to guarantee that the page 1287 * will go into either the inactive or active queues, and will NEVER 1288 * be placed in the cache - for example, just after dirtying a page. 1289 * dirty pages in the cache are not allowed. 1290 * 1291 * The page queues must be locked. 1292 * This routine may not block. 1293 */ 1294 void 1295 vm_page_unwire(vm_page_t m, int activate) 1296 { 1297 crit_enter(); 1298 lwkt_gettoken(&vm_token); 1299 if (m->flags & PG_FICTITIOUS) { 1300 /* do nothing */ 1301 } else if (m->wire_count <= 0) { 1302 panic("vm_page_unwire: invalid wire count: %d", m->wire_count); 1303 } else { 1304 if (--m->wire_count == 0) { 1305 --vmstats.v_wire_count; 1306 if (m->flags & PG_UNMANAGED) { 1307 ; 1308 } else if (activate) { 1309 TAILQ_INSERT_TAIL( 1310 &vm_page_queues[PQ_ACTIVE].pl, m, pageq); 1311 m->queue = PQ_ACTIVE; 1312 vm_page_queues[PQ_ACTIVE].lcnt++; 1313 vmstats.v_active_count++; 1314 } else { 1315 vm_page_flag_clear(m, PG_WINATCFLS); 1316 TAILQ_INSERT_TAIL( 1317 &vm_page_queues[PQ_INACTIVE].pl, m, pageq); 1318 m->queue = PQ_INACTIVE; 1319 vm_page_queues[PQ_INACTIVE].lcnt++; 1320 vmstats.v_inactive_count++; 1321 ++vm_swapcache_inactive_heuristic; 1322 } 1323 } 1324 } 1325 lwkt_reltoken(&vm_token); 1326 crit_exit(); 1327 } 1328 1329 1330 /* 1331 * Move the specified page to the inactive queue. If the page has 1332 * any associated swap, the swap is deallocated. 1333 * 1334 * Normally athead is 0 resulting in LRU operation. athead is set 1335 * to 1 if we want this page to be 'as if it were placed in the cache', 1336 * except without unmapping it from the process address space. 1337 * 1338 * This routine may not block. 1339 * The caller must hold vm_token. 1340 */ 1341 static __inline void 1342 _vm_page_deactivate(vm_page_t m, int athead) 1343 { 1344 /* 1345 * Ignore if already inactive. 
1346 */ 1347 if (m->queue == PQ_INACTIVE) 1348 return; 1349 1350 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { 1351 if ((m->queue - m->pc) == PQ_CACHE) 1352 mycpu->gd_cnt.v_reactivated++; 1353 vm_page_flag_clear(m, PG_WINATCFLS); 1354 vm_page_unqueue(m); 1355 if (athead) { 1356 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, 1357 m, pageq); 1358 } else { 1359 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, 1360 m, pageq); 1361 ++vm_swapcache_inactive_heuristic; 1362 } 1363 m->queue = PQ_INACTIVE; 1364 vm_page_queues[PQ_INACTIVE].lcnt++; 1365 vmstats.v_inactive_count++; 1366 } 1367 } 1368 1369 /* 1370 * Attempt to deactivate a page. 1371 * 1372 * No requirements. 1373 */ 1374 void 1375 vm_page_deactivate(vm_page_t m) 1376 { 1377 crit_enter(); 1378 lwkt_gettoken(&vm_token); 1379 _vm_page_deactivate(m, 0); 1380 lwkt_reltoken(&vm_token); 1381 crit_exit(); 1382 } 1383 1384 /* 1385 * Attempt to move a page to PQ_CACHE. 1386 * Returns 0 on failure, 1 on success 1387 * 1388 * No requirements. 1389 */ 1390 int 1391 vm_page_try_to_cache(vm_page_t m) 1392 { 1393 crit_enter(); 1394 lwkt_gettoken(&vm_token); 1395 if (m->dirty || m->hold_count || m->busy || m->wire_count || 1396 (m->flags & (PG_BUSY|PG_UNMANAGED))) { 1397 lwkt_reltoken(&vm_token); 1398 crit_exit(); 1399 return(0); 1400 } 1401 vm_page_test_dirty(m); 1402 if (m->dirty) { 1403 lwkt_reltoken(&vm_token); 1404 crit_exit(); 1405 return(0); 1406 } 1407 vm_page_cache(m); 1408 lwkt_reltoken(&vm_token); 1409 crit_exit(); 1410 return(1); 1411 } 1412 1413 /* 1414 * Attempt to free the page. If we cannot free it, we do nothing. 1415 * 1 is returned on success, 0 on failure. 1416 * 1417 * No requirements. 1418 */ 1419 int 1420 vm_page_try_to_free(vm_page_t m) 1421 { 1422 crit_enter(); 1423 lwkt_gettoken(&vm_token); 1424 if (m->dirty || m->hold_count || m->busy || m->wire_count || 1425 (m->flags & (PG_BUSY|PG_UNMANAGED))) { 1426 lwkt_reltoken(&vm_token); 1427 crit_exit(); 1428 return(0); 1429 } 1430 vm_page_test_dirty(m); 1431 if (m->dirty) { 1432 lwkt_reltoken(&vm_token); 1433 crit_exit(); 1434 return(0); 1435 } 1436 vm_page_busy(m); 1437 vm_page_protect(m, VM_PROT_NONE); 1438 vm_page_free(m); 1439 lwkt_reltoken(&vm_token); 1440 crit_exit(); 1441 return(1); 1442 } 1443 1444 /* 1445 * vm_page_cache 1446 * 1447 * Put the specified page onto the page cache queue (if appropriate). 1448 * 1449 * The caller must hold vm_token. 1450 * This routine may not block. 1451 */ 1452 void 1453 vm_page_cache(vm_page_t m) 1454 { 1455 ASSERT_IN_CRIT_SECTION(); 1456 ASSERT_LWKT_TOKEN_HELD(&vm_token); 1457 1458 if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy || 1459 m->wire_count || m->hold_count) { 1460 kprintf("vm_page_cache: attempting to cache busy/held page\n"); 1461 return; 1462 } 1463 1464 /* 1465 * Already in the cache (and thus not mapped) 1466 */ 1467 if ((m->queue - m->pc) == PQ_CACHE) { 1468 KKASSERT((m->flags & PG_MAPPED) == 0); 1469 return; 1470 } 1471 1472 /* 1473 * Caller is required to test m->dirty, but note that the act of 1474 * removing the page from its maps can cause it to become dirty 1475 * on an SMP system due to another cpu running in usermode. 1476 */ 1477 if (m->dirty) { 1478 panic("vm_page_cache: caching a dirty page, pindex: %ld", 1479 (long)m->pindex); 1480 } 1481 1482 /* 1483 * Remove all pmaps and indicate that the page is not 1484 * writeable or mapped. Our vm_page_protect() call may 1485 * have blocked (especially w/ VM_PROT_NONE), so recheck 1486 * everything. 
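	 *
	 * After the recheck the page is either (a) busy/held/wired again
	 * and left alone, (b) was dirtied while we were blocked and is
	 * deactivated instead, or (c) is moved onto its PQ_CACHE queue.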
	 */
	vm_page_busy(m);
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_wakeup(m);
	if ((m->flags & (PG_BUSY|PG_UNMANAGED|PG_MAPPED)) || m->busy ||
	    m->wire_count || m->hold_count) {
		/* do nothing */
	} else if (m->dirty) {
		vm_page_deactivate(m);
	} else {
		vm_page_unqueue_nowakeup(m);
		m->queue = PQ_CACHE + m->pc;
		vm_page_queues[m->queue].lcnt++;
		TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
		vmstats.v_cache_count++;
		vm_page_free_wakeup();
	}
}

/*
 * vm_page_dontneed()
 *
 * Cache, deactivate, or do nothing as appropriate.  This routine
 * is typically used by madvise() MADV_DONTNEED.
 *
 * Generally speaking we want to move the page into the cache so
 * it gets reused quickly.  However, this can result in a silly syndrome
 * due to the page recycling too quickly.  Small objects will not be
 * fully cached.  On the other hand, if we move the page to the inactive
 * queue we wind up with a problem whereby very large objects
 * unnecessarily blow away our inactive and cache queues.
 *
 * The solution is to move the pages based on a fixed weighting.  We
 * either leave them alone, deactivate them, or move them to the cache,
 * where moving them to the cache has the highest weighting.
 * By forcing some pages into other queues we eventually force the
 * system to balance the queues, potentially recovering other unrelated
 * space from active.  The idea is to not force this to happen too
 * often.
 *
 * No requirements.
 */
void
vm_page_dontneed(vm_page_t m)
{
	static int dnweight;
	int dnw;
	int head;

	dnw = ++dnweight;

	/*
	 * Occasionally leave the page alone.
	 */
	crit_enter();
	lwkt_gettoken(&vm_token);
	if ((dnw & 0x01F0) == 0 ||
	    m->queue == PQ_INACTIVE ||
	    m->queue - m->pc == PQ_CACHE
	) {
		if (m->act_count >= ACT_INIT)
			--m->act_count;
		lwkt_reltoken(&vm_token);
		crit_exit();
		return;
	}

	if (m->dirty == 0)
		vm_page_test_dirty(m);

	if (m->dirty || (dnw & 0x0070) == 0) {
		/*
		 * Deactivate the page 3 times out of 32.
		 */
		head = 0;
	} else {
		/*
		 * Cache the page 28 times out of every 32.  Note that
		 * the page is deactivated instead of cached, but placed
		 * at the head of the queue instead of the tail.
		 */
		head = 1;
	}
	_vm_page_deactivate(m, head);
	lwkt_reltoken(&vm_token);
	crit_exit();
}

/*
 * Grab a page, blocking if it is busy and allocating a page if necessary.
 * A busy page is returned or NULL.
 *
 * If VM_ALLOC_RETRY is specified VM_ALLOC_NORMAL must also be specified.
 * If VM_ALLOC_RETRY is not specified the routine may return NULL.
 *
 * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
 * always returned if we had blocked.
 * This routine will never return NULL if VM_ALLOC_RETRY is set.
 * This routine may not be called from an interrupt.
 * The returned page may not be entirely valid.
 *
 * This routine may be called from mainline code without spl protection and
 * be guaranteed a busied page associated with the object at the specified
 * index.
 *
 * No requirements.
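 *
 * Example (sketch only; 'object' and 'pindex' are the caller's):
 *
 *	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 *	... m is busied and cannot be NULL because VM_ALLOC_RETRY was set,
 *	    but its contents may still need to be validated ...
 *	vm_page_wakeup(m);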
1593 */ 1594 vm_page_t 1595 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) 1596 { 1597 vm_page_t m; 1598 int generation; 1599 1600 KKASSERT(allocflags & 1601 (VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM)); 1602 crit_enter(); 1603 lwkt_gettoken(&vm_token); 1604 retrylookup: 1605 if ((m = vm_page_lookup(object, pindex)) != NULL) { 1606 if (m->busy || (m->flags & PG_BUSY)) { 1607 generation = object->generation; 1608 1609 while ((object->generation == generation) && 1610 (m->busy || (m->flags & PG_BUSY))) { 1611 vm_page_flag_set(m, PG_WANTED | PG_REFERENCED); 1612 tsleep(m, 0, "pgrbwt", 0); 1613 if ((allocflags & VM_ALLOC_RETRY) == 0) { 1614 m = NULL; 1615 goto done; 1616 } 1617 } 1618 goto retrylookup; 1619 } else { 1620 vm_page_busy(m); 1621 goto done; 1622 } 1623 } 1624 m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY); 1625 if (m == NULL) { 1626 vm_wait(0); 1627 if ((allocflags & VM_ALLOC_RETRY) == 0) 1628 goto done; 1629 goto retrylookup; 1630 } 1631 done: 1632 lwkt_reltoken(&vm_token); 1633 crit_exit(); 1634 return(m); 1635 } 1636 1637 /* 1638 * Mapping function for valid bits or for dirty bits in 1639 * a page. May not block. 1640 * 1641 * Inputs are required to range within a page. 1642 * 1643 * No requirements. 1644 * Non blocking. 1645 */ 1646 int 1647 vm_page_bits(int base, int size) 1648 { 1649 int first_bit; 1650 int last_bit; 1651 1652 KASSERT( 1653 base + size <= PAGE_SIZE, 1654 ("vm_page_bits: illegal base/size %d/%d", base, size) 1655 ); 1656 1657 if (size == 0) /* handle degenerate case */ 1658 return(0); 1659 1660 first_bit = base >> DEV_BSHIFT; 1661 last_bit = (base + size - 1) >> DEV_BSHIFT; 1662 1663 return ((2 << last_bit) - (1 << first_bit)); 1664 } 1665 1666 /* 1667 * Sets portions of a page valid and clean. The arguments are expected 1668 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive 1669 * of any partial chunks touched by the range. The invalid portion of 1670 * such chunks will be zero'd. 1671 * 1672 * NOTE: When truncating a buffer vnode_pager_setsize() will automatically 1673 * align base to DEV_BSIZE so as not to mark clean a partially 1674 * truncated device block. Otherwise the dirty page status might be 1675 * lost. 1676 * 1677 * This routine may not block. 1678 * 1679 * (base + size) must be less then or equal to PAGE_SIZE. 1680 */ 1681 static void 1682 _vm_page_zero_valid(vm_page_t m, int base, int size) 1683 { 1684 int frag; 1685 int endoff; 1686 1687 if (size == 0) /* handle degenerate case */ 1688 return; 1689 1690 /* 1691 * If the base is not DEV_BSIZE aligned and the valid 1692 * bit is clear, we have to zero out a portion of the 1693 * first block. 1694 */ 1695 1696 if ((frag = base & ~(DEV_BSIZE - 1)) != base && 1697 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0 1698 ) { 1699 pmap_zero_page_area( 1700 VM_PAGE_TO_PHYS(m), 1701 frag, 1702 base - frag 1703 ); 1704 } 1705 1706 /* 1707 * If the ending offset is not DEV_BSIZE aligned and the 1708 * valid bit is clear, we have to zero out a portion of 1709 * the last block. 1710 */ 1711 1712 endoff = base + size; 1713 1714 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && 1715 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0 1716 ) { 1717 pmap_zero_page_area( 1718 VM_PAGE_TO_PHYS(m), 1719 endoff, 1720 DEV_BSIZE - (endoff & (DEV_BSIZE - 1)) 1721 ); 1722 } 1723 } 1724 1725 /* 1726 * Set valid, clear dirty bits. If validating the entire 1727 * page we can safely clear the pmap modify bit. 
We also 1728 * use this opportunity to clear the PG_NOSYNC flag. If a process 1729 * takes a write fault on a MAP_NOSYNC memory area the flag will 1730 * be set again. 1731 * 1732 * We set valid bits inclusive of any overlap, but we can only 1733 * clear dirty bits for DEV_BSIZE chunks that are fully within 1734 * the range. 1735 * 1736 * Page must be busied? 1737 * No other requirements. 1738 */ 1739 void 1740 vm_page_set_valid(vm_page_t m, int base, int size) 1741 { 1742 _vm_page_zero_valid(m, base, size); 1743 m->valid |= vm_page_bits(base, size); 1744 } 1745 1746 1747 /* 1748 * Set valid bits and clear dirty bits. 1749 * 1750 * NOTE: This function does not clear the pmap modified bit. 1751 * Also note that e.g. NFS may use a byte-granular base 1752 * and size. 1753 * 1754 * Page must be busied? 1755 * No other requirements. 1756 */ 1757 void 1758 vm_page_set_validclean(vm_page_t m, int base, int size) 1759 { 1760 int pagebits; 1761 1762 _vm_page_zero_valid(m, base, size); 1763 pagebits = vm_page_bits(base, size); 1764 m->valid |= pagebits; 1765 m->dirty &= ~pagebits; 1766 if (base == 0 && size == PAGE_SIZE) { 1767 /*pmap_clear_modify(m);*/ 1768 vm_page_flag_clear(m, PG_NOSYNC); 1769 } 1770 } 1771 1772 /* 1773 * Set valid & dirty. Used by buwrite() 1774 * 1775 * Page must be busied? 1776 * No other requirements. 1777 */ 1778 void 1779 vm_page_set_validdirty(vm_page_t m, int base, int size) 1780 { 1781 int pagebits; 1782 1783 pagebits = vm_page_bits(base, size); 1784 m->valid |= pagebits; 1785 m->dirty |= pagebits; 1786 if (m->object) 1787 vm_object_set_writeable_dirty(m->object); 1788 } 1789 1790 /* 1791 * Clear dirty bits. 1792 * 1793 * NOTE: This function does not clear the pmap modified bit. 1794 * Also note that e.g. NFS may use a byte-granular base 1795 * and size. 1796 * 1797 * Page must be busied? 1798 * No other requirements. 1799 */ 1800 void 1801 vm_page_clear_dirty(vm_page_t m, int base, int size) 1802 { 1803 m->dirty &= ~vm_page_bits(base, size); 1804 if (base == 0 && size == PAGE_SIZE) { 1805 /*pmap_clear_modify(m);*/ 1806 vm_page_flag_clear(m, PG_NOSYNC); 1807 } 1808 } 1809 1810 /* 1811 * Make the page all-dirty. 1812 * 1813 * Also make sure the related object and vnode reflect the fact that the 1814 * object may now contain a dirty page. 1815 * 1816 * Page must be busied? 1817 * No other requirements. 1818 */ 1819 void 1820 vm_page_dirty(vm_page_t m) 1821 { 1822 #ifdef INVARIANTS 1823 int pqtype = m->queue - m->pc; 1824 #endif 1825 KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE, 1826 ("vm_page_dirty: page in free/cache queue!")); 1827 if (m->dirty != VM_PAGE_BITS_ALL) { 1828 m->dirty = VM_PAGE_BITS_ALL; 1829 if (m->object) 1830 vm_object_set_writeable_dirty(m->object); 1831 } 1832 } 1833 1834 /* 1835 * Invalidates DEV_BSIZE'd chunks within a page. Both the 1836 * valid and dirty bits for the effected areas are cleared. 1837 * 1838 * Page must be busied? 1839 * Does not block. 1840 * No other requirements. 1841 */ 1842 void 1843 vm_page_set_invalid(vm_page_t m, int base, int size) 1844 { 1845 int bits; 1846 1847 bits = vm_page_bits(base, size); 1848 m->valid &= ~bits; 1849 m->dirty &= ~bits; 1850 m->object->generation++; 1851 } 1852 1853 /* 1854 * The kernel assumes that the invalid portions of a page contain 1855 * garbage, but such pages can be mapped into memory by user code. 1856 * When this occurs, we must zero out the non-valid portions of the 1857 * page so user code sees what it expects. 
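 *
 * For example (illustrative, assuming the usual 512 byte DEV_BSIZE and a
 * 4096 byte page): a page whose first 2048 bytes are valid has
 * m->valid == 0x000f, and this routine zeros the remaining four
 * DEV_BSIZE chunks.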
1858 * 1859 * Pages are most often semi-valid when the end of a file is mapped 1860 * into memory and the file's size is not page aligned. 1861 * 1862 * Page must be busied? 1863 * No other requirements. 1864 */ 1865 void 1866 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) 1867 { 1868 int b; 1869 int i; 1870 1871 /* 1872 * Scan the valid bits looking for invalid sections that 1873 * must be zerod. Invalid sub-DEV_BSIZE'd areas ( where the 1874 * valid bit may be set ) have already been zerod by 1875 * vm_page_set_validclean(). 1876 */ 1877 for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { 1878 if (i == (PAGE_SIZE / DEV_BSIZE) || 1879 (m->valid & (1 << i)) 1880 ) { 1881 if (i > b) { 1882 pmap_zero_page_area( 1883 VM_PAGE_TO_PHYS(m), 1884 b << DEV_BSHIFT, 1885 (i - b) << DEV_BSHIFT 1886 ); 1887 } 1888 b = i + 1; 1889 } 1890 } 1891 1892 /* 1893 * setvalid is TRUE when we can safely set the zero'd areas 1894 * as being valid. We can do this if there are no cache consistency 1895 * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. 1896 */ 1897 if (setvalid) 1898 m->valid = VM_PAGE_BITS_ALL; 1899 } 1900 1901 /* 1902 * Is a (partial) page valid? Note that the case where size == 0 1903 * will return FALSE in the degenerate case where the page is entirely 1904 * invalid, and TRUE otherwise. 1905 * 1906 * Does not block. 1907 * No other requirements. 1908 */ 1909 int 1910 vm_page_is_valid(vm_page_t m, int base, int size) 1911 { 1912 int bits = vm_page_bits(base, size); 1913 1914 if (m->valid && ((m->valid & bits) == bits)) 1915 return 1; 1916 else 1917 return 0; 1918 } 1919 1920 /* 1921 * update dirty bits from pmap/mmu. May not block. 1922 * 1923 * Caller must hold vm_token if non-blocking operation desired. 1924 * No other requirements. 1925 */ 1926 void 1927 vm_page_test_dirty(vm_page_t m) 1928 { 1929 if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) { 1930 vm_page_dirty(m); 1931 } 1932 } 1933 1934 /* 1935 * Register an action, associating it with its vm_page 1936 */ 1937 void 1938 vm_page_register_action(vm_page_action_t action, vm_page_event_t event) 1939 { 1940 struct vm_page_action_list *list; 1941 int hv; 1942 1943 hv = (int)((intptr_t)action->m >> 8) & VMACTION_HMASK; 1944 list = &action_list[hv]; 1945 1946 lwkt_gettoken(&vm_token); 1947 vm_page_flag_set(action->m, PG_ACTIONLIST); 1948 action->event = event; 1949 LIST_INSERT_HEAD(list, action, entry); 1950 lwkt_reltoken(&vm_token); 1951 } 1952 1953 /* 1954 * Unregister an action, disassociating it from its related vm_page 1955 */ 1956 void 1957 vm_page_unregister_action(vm_page_action_t action) 1958 { 1959 struct vm_page_action_list *list; 1960 int hv; 1961 1962 lwkt_gettoken(&vm_token); 1963 if (action->event != VMEVENT_NONE) { 1964 action->event = VMEVENT_NONE; 1965 LIST_REMOVE(action, entry); 1966 1967 hv = (int)((intptr_t)action->m >> 8) & VMACTION_HMASK; 1968 list = &action_list[hv]; 1969 if (LIST_EMPTY(list)) 1970 vm_page_flag_clear(action->m, PG_ACTIONLIST); 1971 } 1972 lwkt_reltoken(&vm_token); 1973 } 1974 1975 /* 1976 * Issue an event on a VM page. Corresponding action structures are 1977 * removed from the page's list and called. 1978 * 1979 * If the vm_page has no more pending action events we clear its 1980 * PG_ACTIONLIST flag. 
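 *
 * Sketch of how an action might be wired up (the callback and event name
 * here are hypothetical; only fields actually referenced in this file are
 * shown):
 *
 *	static void my_cb(vm_page_t p, vm_page_action_t act) { ... }
 *
 *	action->m = m;
 *	action->func = my_cb;
 *	vm_page_register_action(action, SOME_VMEVENT);
 *	...
 *	vm_page_event_internal(m, SOME_VMEVENT);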
1981 */ 1982 void 1983 vm_page_event_internal(vm_page_t m, vm_page_event_t event) 1984 { 1985 struct vm_page_action_list *list; 1986 struct vm_page_action *scan; 1987 struct vm_page_action *next; 1988 int hv; 1989 int all; 1990 1991 hv = (int)((intptr_t)m >> 8) & VMACTION_HMASK; 1992 list = &action_list[hv]; 1993 all = 1; 1994 1995 lwkt_gettoken(&vm_token); 1996 LIST_FOREACH_MUTABLE(scan, list, entry, next) { 1997 if (scan->m == m) { 1998 if (scan->event == event) { 1999 scan->event = VMEVENT_NONE; 2000 LIST_REMOVE(scan, entry); 2001 scan->func(m, scan); 2002 /* XXX */ 2003 } else { 2004 all = 0; 2005 } 2006 } 2007 } 2008 if (all) 2009 vm_page_flag_clear(m, PG_ACTIONLIST); 2010 lwkt_reltoken(&vm_token); 2011 } 2012 2013 2014 #include "opt_ddb.h" 2015 #ifdef DDB 2016 #include <sys/kernel.h> 2017 2018 #include <ddb/ddb.h> 2019 2020 DB_SHOW_COMMAND(page, vm_page_print_page_info) 2021 { 2022 db_printf("vmstats.v_free_count: %d\n", vmstats.v_free_count); 2023 db_printf("vmstats.v_cache_count: %d\n", vmstats.v_cache_count); 2024 db_printf("vmstats.v_inactive_count: %d\n", vmstats.v_inactive_count); 2025 db_printf("vmstats.v_active_count: %d\n", vmstats.v_active_count); 2026 db_printf("vmstats.v_wire_count: %d\n", vmstats.v_wire_count); 2027 db_printf("vmstats.v_free_reserved: %d\n", vmstats.v_free_reserved); 2028 db_printf("vmstats.v_free_min: %d\n", vmstats.v_free_min); 2029 db_printf("vmstats.v_free_target: %d\n", vmstats.v_free_target); 2030 db_printf("vmstats.v_cache_min: %d\n", vmstats.v_cache_min); 2031 db_printf("vmstats.v_inactive_target: %d\n", vmstats.v_inactive_target); 2032 } 2033 2034 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) 2035 { 2036 int i; 2037 db_printf("PQ_FREE:"); 2038 for(i=0;i<PQ_L2_SIZE;i++) { 2039 db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt); 2040 } 2041 db_printf("\n"); 2042 2043 db_printf("PQ_CACHE:"); 2044 for(i=0;i<PQ_L2_SIZE;i++) { 2045 db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt); 2046 } 2047 db_printf("\n"); 2048 2049 db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n", 2050 vm_page_queues[PQ_ACTIVE].lcnt, 2051 vm_page_queues[PQ_INACTIVE].lcnt); 2052 } 2053 #endif /* DDB */ 2054