/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
 * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
 * $DragonFly: src/sys/vm/vm_page.c,v 1.10 2003/09/14 21:14:53 dillon Exp $
 */

/*
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * Resident memory management module.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_page2.h>

static void vm_page_queue_init (void);
static vm_page_t vm_page_select_cache (vm_object_t, vm_pindex_t);

/*
 * Associated with each page of user-allocatable memory is a
 * page structure.
 */

static struct vm_page **vm_page_buckets; /* Array of buckets */
static int vm_page_bucket_count;	/* How big is array? */
static int vm_page_hash_mask;		/* Mask for hash function */
static volatile int vm_page_bucket_generation;

struct vpgqueues vm_page_queues[PQ_COUNT];

static void
vm_page_queue_init(void)
{
	int i;

	for (i = 0; i < PQ_L2_SIZE; i++) {
		vm_page_queues[PQ_FREE+i].cnt = &vmstats.v_free_count;
	}
	vm_page_queues[PQ_INACTIVE].cnt = &vmstats.v_inactive_count;

	vm_page_queues[PQ_ACTIVE].cnt = &vmstats.v_active_count;
	vm_page_queues[PQ_HOLD].cnt = &vmstats.v_active_count;
	for (i = 0; i < PQ_L2_SIZE; i++) {
		vm_page_queues[PQ_CACHE+i].cnt = &vmstats.v_cache_count;
	}
	for (i = 0; i < PQ_COUNT; i++) {
		TAILQ_INIT(&vm_page_queues[i].pl);
	}
}

vm_page_t vm_page_array = 0;
int vm_page_array_size = 0;
long first_page = 0;
int vm_page_zero_count = 0;

static __inline int vm_page_hash (vm_object_t object, vm_pindex_t pindex);
static void vm_page_free_wakeup (void);

/*
 * vm_set_page_size:
 *
 *	Sets the page size, perhaps based upon the memory
 *	size.  Must be called before any use of page-size
 *	dependent functions.
 */
void
vm_set_page_size(void)
{
	if (vmstats.v_page_size == 0)
		vmstats.v_page_size = PAGE_SIZE;
	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
		panic("vm_set_page_size: page size not a power of two");
}

/*
 * vm_add_new_page:
 *
 *	Add a new page to the freelist for use by the system.  New pages
 *	are added to both the head and tail of the associated free page
 *	queue in a bottom-up fashion, so both zero'd and non-zero'd page
 *	requests pull 'recent' adds (higher physical addresses) first.
 *
 *	Must be called at splhigh().
 */
vm_page_t
vm_add_new_page(vm_offset_t pa)
{
	vm_page_t m;
	struct vpgqueues *vpq;

	++vmstats.v_page_count;
	++vmstats.v_free_count;
	m = PHYS_TO_VM_PAGE(pa);
	m->phys_addr = pa;
	m->flags = 0;
	m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
	m->queue = m->pc + PQ_FREE;
	vpq = &vm_page_queues[m->queue];
	if (vpq->flipflop)
		TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
	else
		TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
	vpq->flipflop = 1 - vpq->flipflop;
	vm_page_queues[m->queue].lcnt++;
	return (m);
}
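/*
 * Illustrative sketch (not part of the original code): how a page's
 * free-queue "color" is derived from its physical address in
 * vm_add_new_page() above.  Assuming PQ_L2_SIZE == 16 (so PQ_L2_MASK == 15)
 * and PAGE_SHIFT == 12 -- both are configuration dependent, the numbers
 * here are only an example -- a page at physical address 0x12345000 lands
 * in free queue PQ_FREE + 5:
 *
 *	int pc    = (0x12345000 >> 12) & 15;	// 0x12345 & 15 == 5
 *	int queue = PQ_FREE + pc;		// per-color free queue index
 */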
/*
 * vm_page_startup:
 *
 *	Initializes the resident memory module.
 *
 *	Allocates memory for the page cells, and
 *	for the object/offset-to-page hash table headers.
 *	Each page cell is initialized and placed on the free list.
 */
vm_offset_t
vm_page_startup(vm_offset_t starta, vm_offset_t enda, vm_offset_t vaddr)
{
	vm_offset_t mapped;
	struct vm_page **bucket;
	vm_size_t npages, page_range;
	vm_offset_t new_end;
	int i;
	vm_offset_t pa;
	int nblocks;
	vm_offset_t last_pa;

	/* the biggest memory array is the second group of pages */
	vm_offset_t end;
	vm_offset_t biggestone, biggestsize;

	vm_offset_t total;

	total = 0;
	biggestsize = 0;
	biggestone = 0;
	nblocks = 0;
	vaddr = round_page(vaddr);

	for (i = 0; phys_avail[i + 1]; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; phys_avail[i + 1]; i += 2) {
		int size = phys_avail[i + 1] - phys_avail[i];

		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
		++nblocks;
		total += size;
	}

	end = phys_avail[biggestone+1];

	/*
	 * Initialize the queue headers for the free queue, the active queue
	 * and the inactive queue.
	 */

	vm_page_queue_init();

	/*
	 * Allocate (and initialize) the hash table buckets.
	 *
	 * The number of buckets MUST BE a power of 2, and the actual value is
	 * the next power of 2 greater than the number of physical pages in
	 * the system.
	 *
	 * We make the hash table approximately 2x the number of pages to
	 * reduce the chain length.  This is about the same size using the
	 * singly-linked list as the 1x hash table we were using before
	 * using TAILQ but the chain length will be smaller.
	 *
	 * Note: This computation can be tweaked if desired.
	 */
	vm_page_buckets = (struct vm_page **)vaddr;
	bucket = vm_page_buckets;
	if (vm_page_bucket_count == 0) {
		vm_page_bucket_count = 1;
		while (vm_page_bucket_count < atop(total))
			vm_page_bucket_count <<= 1;
	}
	vm_page_bucket_count <<= 1;
	vm_page_hash_mask = vm_page_bucket_count - 1;

	/*
	 * Validate these addresses.
	 */
	new_end = end - vm_page_bucket_count * sizeof(struct vm_page *);
	new_end = trunc_page(new_end);
	mapped = round_page(vaddr);
	vaddr = pmap_map(mapped, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);
	vaddr = round_page(vaddr);
	bzero((caddr_t) mapped, vaddr - mapped);

	for (i = 0; i < vm_page_bucket_count; i++) {
		*bucket = NULL;
		bucket++;
	}

	/*
	 * Compute the number of pages of memory that will be available for
	 * use (taking into account the overhead of a page structure per
	 * page).
	 */

	first_page = phys_avail[0] / PAGE_SIZE;

	page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page;
	npages = (total - (page_range * sizeof(struct vm_page)) -
	    (end - new_end)) / PAGE_SIZE;

	end = new_end;

	/*
	 * Initialize the mem entry structures now, and put them in the free
	 * queue.
	 */
	vm_page_array = (vm_page_t) vaddr;
	mapped = vaddr;

	/*
	 * Validate these addresses.
	 */

	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
	mapped = pmap_map(mapped, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);

	/*
	 * Clear all of the page structures
	 */
	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
	vm_page_array_size = page_range;

	/*
	 * Construct the free queue(s) in ascending order (by physical
	 * address) so that the first 16MB of physical memory is allocated
	 * last rather than first.  On large-memory machines, this avoids
	 * the exhaustion of low physical memory before isa_dmainit has run.
	 */
	vmstats.v_page_count = 0;
	vmstats.v_free_count = 0;
	for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) {
		pa = phys_avail[i];
		if (i == biggestone)
			last_pa = new_end;
		else
			last_pa = phys_avail[i + 1];
		while (pa < last_pa && npages-- > 0) {
			vm_add_new_page(pa);
			pa += PAGE_SIZE;
		}
	}
	return (mapped);
}
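/*
 * Illustrative sketch (not part of the original code): how the bucket
 * count computed in vm_page_startup() behaves.  Suppose the system has
 * 49152 physical pages (atop(total) == 49152, a made-up figure).  The
 * doubling loop stops at 65536, and the extra '<<= 1' yields 131072
 * buckets, i.e. roughly 2x the page count.  Because the count is a power
 * of 2, the mask (vm_page_hash_mask == 131071) lets vm_page_hash() reduce
 * a hash value with a single AND instead of a modulo:
 *
 *	bucket_index = hashval & vm_page_hash_mask;
 */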
/*
 * vm_page_hash:
 *
 *	Distributes the object/offset key pair among hash buckets.
 *
 *	NOTE:  This routine depends on vm_page_bucket_count being a power of 2.
 *	This routine may not block.
 *
 *	We try to randomize the hash based on the object to spread the pages
 *	out in the hash table without it costing us too much.
 */
static __inline int
vm_page_hash(vm_object_t object, vm_pindex_t pindex)
{
	int i = ((uintptr_t)object + pindex) ^ object->hash_rand;

	return(i & vm_page_hash_mask);
}

void
vm_page_unhold(vm_page_t mem)
{
	--mem->hold_count;
	KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
	if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
		vm_page_free_toq(mem);
}

/*
 * vm_page_insert:		[ internal use only ]
 *
 *	Inserts the given mem entry into the object and object list.
 *
 *	The pagetables are not updated but will presumably fault the page
 *	in if necessary, or if a kernel page the caller will at some point
 *	enter the page into the kernel's pmap.  We are not allowed to block
 *	here so we *can't* do this anyway.
 *
 *	The object and page must be locked, and must be splhigh.
 *	This routine may not block.
 */
void
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
	struct vm_page **bucket;

	if (m->object != NULL)
		panic("vm_page_insert: already inserted");

	/*
	 * Record the object/offset pair in this page
	 */

	m->object = object;
	m->pindex = pindex;

	/*
	 * Insert it into the object_object/offset hash table
	 */

	bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
	m->hnext = *bucket;
	*bucket = m;
	vm_page_bucket_generation++;

	/*
	 * Now link into the object's list of backed pages.
	 */

	TAILQ_INSERT_TAIL(&object->memq, m, listq);
	object->generation++;

	/*
	 * show that the object has one more resident page.
	 */

	object->resident_page_count++;

	/*
	 * Since we are inserting a new and possibly dirty page,
	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
	 */
	if (m->flags & PG_WRITEABLE)
		vm_object_set_writeable_dirty(object);
}
/*
 * vm_page_remove:
 *				NOTE: used by device pager as well -wfj
 *
 *	Removes the given mem entry from the object/offset-page
 *	table and the object page list, but do not invalidate/terminate
 *	the backing store.
 *
 *	The object and page must be locked, and at splhigh.
 *	The underlying pmap entry (if any) is NOT removed here.
 *	This routine may not block.
 */
void
vm_page_remove(vm_page_t m)
{
	vm_object_t object;

	if (m->object == NULL)
		return;

	if ((m->flags & PG_BUSY) == 0) {
		panic("vm_page_remove: page not busy");
	}

	/*
	 * Basically destroy the page.
	 */

	vm_page_wakeup(m);

	object = m->object;

	/*
	 * Remove from the object_object/offset hash table.  The object
	 * must be on the hash queue, we will panic if it isn't
	 *
	 * Note: we must NULL-out m->hnext to prevent loops in detached
	 * buffers with vm_page_lookup().
	 */

	{
		struct vm_page **bucket;

		bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)];
		while (*bucket != m) {
			if (*bucket == NULL)
				panic("vm_page_remove(): page not found in hash");
			bucket = &(*bucket)->hnext;
		}
		*bucket = m->hnext;
		m->hnext = NULL;
		vm_page_bucket_generation++;
	}

	/*
	 * Now remove from the object's list of backed pages.
	 */

	TAILQ_REMOVE(&object->memq, m, listq);

	/*
	 * And show that the object has one fewer resident page.
	 */

	object->resident_page_count--;
	object->generation++;

	m->object = NULL;
}

/*
 * vm_page_lookup:
 *
 *	Returns the page associated with the object/offset
 *	pair specified; if none is found, NULL is returned.
 *
 *	NOTE: the code below does not lock.  It will operate properly if
 *	an interrupt makes a change, but the generation algorithm will not
 *	operate properly in an SMP environment where both cpus are able to run
 *	kernel code simultaneously.
 *
 *	The object must be locked.  No side effects.
 *	This routine may not block.
 *	This is a critical path routine
 */
vm_page_t
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;
	struct vm_page **bucket;
	int generation;

	/*
	 * Search the hash table for this object/offset pair
	 */

retry:
	generation = vm_page_bucket_generation;
	bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
	for (m = *bucket; m != NULL; m = m->hnext) {
		if ((m->object == object) && (m->pindex == pindex)) {
			if (vm_page_bucket_generation != generation)
				goto retry;
			return (m);
		}
	}
	if (vm_page_bucket_generation != generation)
		goto retry;
	return (NULL);
}
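/*
 * Illustrative sketch (not part of the original code): the generation
 * check used by vm_page_lookup() above is a general pattern for walking
 * a structure that an interrupt may modify.  The reader snapshots the
 * generation counter, walks the chain, and restarts if the counter moved:
 *
 *	int gen;
 *	do {
 *		gen = vm_page_bucket_generation;
 *		// ... walk the hash chain, remembering any match ...
 *	} while (gen != vm_page_bucket_generation);
 *
 * As the comment above notes, this only guards against changes made by
 * interrupts on the same cpu; it is not an SMP-safe algorithm.
 */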
/*
 * vm_page_rename:
 *
 *	Move the given memory entry from its
 *	current object to the specified target object/offset.
 *
 *	The object must be locked.
 *	This routine may not block.
 *
 *	Note: this routine will raise itself to splvm(), the caller need not.
 *
 *	Note: swap associated with the page must be invalidated by the move.  We
 *	      have to do this for several reasons:  (1) we aren't freeing the
 *	      page, (2) we are dirtying the page, (3) the VM system is probably
 *	      moving the page from object A to B, and will then later move
 *	      the backing store from A to B and we can't have a conflict.
 *
 *	Note: we *always* dirty the page.  It is necessary both for the
 *	      fact that we moved it, and because we may be invalidating
 *	      swap.  If the page is on the cache, we have to deactivate it
 *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
 *	      on the cache.
 */
void
vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{
	int s;

	s = splvm();
	vm_page_remove(m);
	vm_page_insert(m, new_object, new_pindex);
	if (m->queue - m->pc == PQ_CACHE)
		vm_page_deactivate(m);
	vm_page_dirty(m);
	splx(s);
}

/*
 * vm_page_unqueue_nowakeup:
 *
 * 	vm_page_unqueue() without any wakeup
 *
 *	This routine must be called at splhigh().
 *	This routine may not block.
 */
void
vm_page_unqueue_nowakeup(vm_page_t m)
{
	int queue = m->queue;
	struct vpgqueues *pq;

	if (queue != PQ_NONE) {
		pq = &vm_page_queues[queue];
		m->queue = PQ_NONE;
		TAILQ_REMOVE(&pq->pl, m, pageq);
		(*pq->cnt)--;
		pq->lcnt--;
	}
}

/*
 * vm_page_unqueue:
 *
 *	Remove a page from its queue.
 *
 *	This routine must be called at splhigh().
 *	This routine may not block.
 */
void
vm_page_unqueue(vm_page_t m)
{
	int queue = m->queue;
	struct vpgqueues *pq;

	if (queue != PQ_NONE) {
		m->queue = PQ_NONE;
		pq = &vm_page_queues[queue];
		TAILQ_REMOVE(&pq->pl, m, pageq);
		(*pq->cnt)--;
		pq->lcnt--;
		if ((queue - m->pc) == PQ_CACHE) {
			if (vm_paging_needed())
				pagedaemon_wakeup();
		}
	}
}

#if PQ_L2_SIZE > 1

/*
 * vm_page_list_find:
 *
 *	Find a page on the specified queue with color optimization.
 *
 *	The page coloring optimization attempts to locate a page
 *	that does not overload other nearby pages in the object in
 *	the cpu's L1 or L2 caches.  We need this optimization because
 *	cpu caches tend to be physical caches, while object spaces tend
 *	to be virtual.
 *
 *	This routine must be called at splvm().
 *	This routine may not block.
 *
 *	This routine may only be called from the vm_page_list_find() macro
 *	in vm_page.h
 */
vm_page_t
_vm_page_list_find(int basequeue, int index)
{
	int i;
	vm_page_t m = NULL;
	struct vpgqueues *pq;

	pq = &vm_page_queues[basequeue];

	/*
	 * Note that for the first loop, index+i and index-i wind up at the
	 * same place.  Even though this is not totally optimal, we've already
	 * blown it by missing the cache case so we do not care.
	 */

	for (i = PQ_L2_SIZE / 2; i > 0; --i) {
		if ((m = TAILQ_FIRST(&pq[(index + i) & PQ_L2_MASK].pl)) != NULL)
			break;

		if ((m = TAILQ_FIRST(&pq[(index - i) & PQ_L2_MASK].pl)) != NULL)
			break;
	}
	return(m);
}

#endif

/*
 * vm_page_select_cache:
 *
 *	Find a page on the cache queue with color optimization.  As pages
 *	might be found, but not applicable, they are deactivated.  This
 *	keeps us from using potentially busy cached pages.
 *
 *	This routine must be called at splvm().
 *	This routine may not block.
 */
static vm_page_t
vm_page_select_cache(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	while (TRUE) {
		m = vm_page_list_find(
		    PQ_CACHE,
		    (pindex + object->pg_color) & PQ_L2_MASK,
		    FALSE
		);
		if (m && ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy ||
			       m->hold_count || m->wire_count)) {
			vm_page_deactivate(m);
			continue;
		}
		return m;
	}
}
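/*
 * Illustrative sketch (not part of the original code): how the color
 * search in _vm_page_list_find() walks the per-color queues.  Assuming
 * PQ_L2_SIZE == 16 (a configuration-dependent value used here only as an
 * example) and a preferred index of 5, the vm_page_list_find() macro in
 * vm_page.h tries queue 5 itself first (per the comment above), and this
 * routine then probes
 *
 *	(5+8)&15 and (5-8)&15   (both 13, the same queue),
 *	(5+7)&15 and (5-7)&15   (12 and 14),
 *	...
 *	(5+1)&15 and (5-1)&15   (6 and 4),
 *
 * covering every color except the preferred one, which has already been
 * tried.
 */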
/*
 * vm_page_select_free:
 *
 *	Find a free or zero page, with specified preference.  We attempt to
 *	inline the nominal case and fall back to _vm_page_select_free()
 *	otherwise.
 *
 *	This routine must be called at splvm().
 *	This routine may not block.
 */
static __inline vm_page_t
vm_page_select_free(vm_object_t object, vm_pindex_t pindex, boolean_t prefer_zero)
{
	vm_page_t m;

	m = vm_page_list_find(
	    PQ_FREE,
	    (pindex + object->pg_color) & PQ_L2_MASK,
	    prefer_zero
	);
	return(m);
}

/*
 * vm_page_alloc:
 *
 *	Allocate and return a memory cell associated
 *	with this VM object/offset pair.
 *
 *	page_req classes:
 *	VM_ALLOC_NORMAL		normal process request
 *	VM_ALLOC_SYSTEM		system *really* needs a page
 *	VM_ALLOC_INTERRUPT	interrupt time request
 *	VM_ALLOC_ZERO		zero page
 *
 *	Object must be locked.
 *	This routine may not block.
 *
 *	Additional special handling is required when called from an
 *	interrupt (VM_ALLOC_INTERRUPT).  We are not allowed to mess with
 *	the page cache in this case.
 */
vm_page_t
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
{
	vm_page_t m = NULL;
	int s;

	KASSERT(!vm_page_lookup(object, pindex),
		("vm_page_alloc: page already allocated"));

	/*
	 * The pager is allowed to eat deeper into the free page list.
	 */

	if ((curthread == pagethread) && (page_req != VM_ALLOC_INTERRUPT)) {
		page_req = VM_ALLOC_SYSTEM;
	}

	s = splvm();

loop:
	if (vmstats.v_free_count > vmstats.v_free_reserved) {
		/*
		 * Allocate from the free queue if there are plenty of pages
		 * in it.
		 */
		if (page_req == VM_ALLOC_ZERO)
			m = vm_page_select_free(object, pindex, TRUE);
		else
			m = vm_page_select_free(object, pindex, FALSE);
	} else if (
	    (page_req == VM_ALLOC_SYSTEM &&
	     vmstats.v_cache_count == 0 &&
	     vmstats.v_free_count > vmstats.v_interrupt_free_min) ||
	    (page_req == VM_ALLOC_INTERRUPT && vmstats.v_free_count > 0)
	) {
		/*
		 * Interrupt or system, dig deeper into the free list.
		 */
		m = vm_page_select_free(object, pindex, FALSE);
	} else if (page_req != VM_ALLOC_INTERRUPT) {
		/*
		 * Allocatable from cache (non-interrupt only).  On success,
		 * we must free the page and try again, thus ensuring that
		 * vmstats.v_*_free_min counters are replenished.
		 */
		m = vm_page_select_cache(object, pindex);
		if (m == NULL) {
			splx(s);
#if defined(DIAGNOSTIC)
			if (vmstats.v_cache_count > 0)
				printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", vmstats.v_cache_count);
#endif
			vm_pageout_deficit++;
			pagedaemon_wakeup();
			return (NULL);
		}
		KASSERT(m->dirty == 0, ("Found dirty cache page %p", m));
		vm_page_busy(m);
		vm_page_protect(m, VM_PROT_NONE);
		vm_page_free(m);
		goto loop;
	} else {
		/*
		 * Not allocatable from cache from interrupt, give up.
		 */
		splx(s);
		vm_pageout_deficit++;
		pagedaemon_wakeup();
		return (NULL);
	}

	/*
	 * At this point we had better have found a good page.
	 */

	KASSERT(
	    m != NULL,
	    ("vm_page_alloc(): missing page on free queue\n")
	);

	/*
	 * Remove from free queue
	 */

	vm_page_unqueue_nowakeup(m);

	/*
	 * Initialize structure.  Only the PG_ZERO flag is inherited.
	 */

	if (m->flags & PG_ZERO) {
		vm_page_zero_count--;
		m->flags = PG_ZERO | PG_BUSY;
	} else {
		m->flags = PG_BUSY;
	}
	m->wire_count = 0;
	m->hold_count = 0;
	m->act_count = 0;
	m->busy = 0;
	m->valid = 0;
	KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));

	/*
	 * vm_page_insert() is safe prior to the splx().  Note also that
	 * inserting a page here does not insert it into the pmap (which
	 * could cause us to block allocating memory).  We cannot block
	 * anywhere.
	 */

	vm_page_insert(m, object, pindex);

	/*
	 * Don't wakeup too often - wakeup the pageout daemon when
	 * we would be nearly out of memory.
	 */
	if (vm_paging_needed())
		pagedaemon_wakeup();

	splx(s);

	return (m);
}
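/*
 * Illustrative sketch (not part of the original code): the usual calling
 * pattern for vm_page_alloc() from a context that is allowed to block.
 * A NULL return means the request class could not be satisfied right now,
 * so the caller typically waits for the pageout daemon and retries
 * (compare vm_page_grab() below, which wraps the same idea):
 *
 *	vm_page_t m;
 *
 *	while ((m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL)) == NULL)
 *		VM_WAIT;	// sleep until free pages are available
 *	// page comes back busy (PG_BUSY) and with no valid bits set
 *
 * The object and pindex here stand for whatever object/offset the caller
 * is populating.
 */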
/*
 * vm_wait:	(also see VM_WAIT macro)
 *
 *	Block until free pages are available for allocation
 *	- Called in various places before memory allocations.
 */
void
vm_wait(void)
{
	int s;

	s = splvm();
	if (curthread == pagethread) {
		vm_pageout_pages_needed = 1;
		tsleep(&vm_pageout_pages_needed, 0, "VMWait", 0);
	} else {
		if (!vm_pages_needed) {
			vm_pages_needed = 1;
			wakeup(&vm_pages_needed);
		}
		tsleep(&vmstats.v_free_count, 0, "vmwait", 0);
	}
	splx(s);
}

/*
 * vm_waitpfault:	(also see VM_WAITPFAULT macro)
 *
 *	Block until free pages are available for allocation
 *	- Called only in vm_fault so that processes page faulting
 *	  can be easily tracked.
 *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
 *	  processes will be able to grab memory first.  Do not change
 *	  this balance without careful testing first.
 */
void
vm_waitpfault(void)
{
	int s;

	s = splvm();
	if (!vm_pages_needed) {
		vm_pages_needed = 1;
		wakeup(&vm_pages_needed);
	}
	tsleep(&vmstats.v_free_count, 0, "pfault", 0);
	splx(s);
}

/*
 * vm_page_activate:
 *
 *	Put the specified page on the active list (if appropriate).
 *	Ensure that act_count is at least ACT_INIT but do not otherwise
 *	mess with it.
 *
 *	The page queues must be locked.
 *	This routine may not block.
 */
void
vm_page_activate(vm_page_t m)
{
	int s;

	s = splvm();
	if (m->queue != PQ_ACTIVE) {
		if ((m->queue - m->pc) == PQ_CACHE)
			mycpu->gd_cnt.v_reactivated++;

		vm_page_unqueue(m);

		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
			m->queue = PQ_ACTIVE;
			vm_page_queues[PQ_ACTIVE].lcnt++;
			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
			if (m->act_count < ACT_INIT)
				m->act_count = ACT_INIT;
			vmstats.v_active_count++;
		}
	} else {
		if (m->act_count < ACT_INIT)
			m->act_count = ACT_INIT;
	}

	splx(s);
}
/*
 * vm_page_free_wakeup:
 *
 *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
 *	routine is called when a page has been added to the cache or free
 *	queues.
 *
 *	This routine may not block.
 *	This routine must be called at splvm()
 */
static __inline void
vm_page_free_wakeup(void)
{
	/*
	 * if pageout daemon needs pages, then tell it that there are
	 * some free.
	 */
	if (vm_pageout_pages_needed &&
	    vmstats.v_cache_count + vmstats.v_free_count >= vmstats.v_pageout_free_min) {
		wakeup(&vm_pageout_pages_needed);
		vm_pageout_pages_needed = 0;
	}
	/*
	 * wakeup processes that are waiting on memory if we hit a
	 * high water mark.  And wakeup scheduler process if we have
	 * lots of memory.  this process will swapin processes.
	 */
	if (vm_pages_needed && !vm_page_count_min()) {
		vm_pages_needed = 0;
		wakeup(&vmstats.v_free_count);
	}
}

/*
 *	vm_page_free_toq:
 *
 *	Returns the given page to the PQ_FREE list,
 *	disassociating it from any VM object.
 *
 *	Object and page must be locked prior to entry.
 *	This routine may not block.
 */
void
vm_page_free_toq(vm_page_t m)
{
	int s;
	struct vpgqueues *pq;
	vm_object_t object = m->object;

	s = splvm();

	mycpu->gd_cnt.v_tfree++;

	if (m->busy || ((m->queue - m->pc) == PQ_FREE)) {
		printf(
		"vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n",
		    (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0,
		    m->hold_count);
		if ((m->queue - m->pc) == PQ_FREE)
			panic("vm_page_free: freeing free page");
		else
			panic("vm_page_free: freeing busy page");
	}

	/*
	 * unqueue, then remove page.  Note that we cannot destroy
	 * the page here because we do not want to call the pager's
	 * callback routine until after we've put the page on the
	 * appropriate free queue.
	 */

	vm_page_unqueue_nowakeup(m);
	vm_page_remove(m);

	/*
	 * If fictitious remove object association and
	 * return, otherwise delay object association removal.
	 */

	if ((m->flags & PG_FICTITIOUS) != 0) {
		splx(s);
		return;
	}

	m->valid = 0;
	vm_page_undirty(m);

	if (m->wire_count != 0) {
		if (m->wire_count > 1) {
			panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx",
				m->wire_count, (long)m->pindex);
		}
		panic("vm_page_free: freeing wired page\n");
	}

	/*
	 * If we've exhausted the object's resident pages we want to free
	 * it up.
	 */

	if (object &&
	    (object->type == OBJT_VNODE) &&
	    ((object->flags & OBJ_DEAD) == 0)
	) {
		struct vnode *vp = (struct vnode *)object->handle;

		if (vp && VSHOULDFREE(vp))
			vfree(vp);
	}

	/*
	 * Clear the UNMANAGED flag when freeing an unmanaged page.
	 */

	if (m->flags & PG_UNMANAGED) {
		m->flags &= ~PG_UNMANAGED;
	} else {
#ifdef __alpha__
		pmap_page_is_free(m);
#endif
	}

	if (m->hold_count != 0) {
		m->flags &= ~PG_ZERO;
		m->queue = PQ_HOLD;
	} else
		m->queue = PQ_FREE + m->pc;
	pq = &vm_page_queues[m->queue];
	pq->lcnt++;
	++(*pq->cnt);

	/*
	 * Put zero'd pages on the end ( where we look for zero'd pages
	 * first ) and non-zero'd pages at the head.
	 */

	if (m->flags & PG_ZERO) {
		TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
		++vm_page_zero_count;
	} else {
		TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
	}

	vm_page_free_wakeup();

	splx(s);
}
/*
 * vm_page_unmanage:
 *
 * 	Prevent PV management from being done on the page.  The page is
 *	removed from the paging queues as if it were wired, and as a
 *	consequence of no longer being managed the pageout daemon will not
 *	touch it (since there is no way to locate the pte mappings for the
 *	page).  madvise() calls that mess with the pmap will also no longer
 *	operate on the page.
 *
 *	Beyond that the page is still reasonably 'normal'.  Freeing the page
 *	will clear the flag.
 *
 *	This routine is used by OBJT_PHYS objects - objects using unswappable
 *	physical memory as backing store rather than swap-backed memory and
 *	will eventually be extended to support 4MB unmanaged physical
 *	mappings.
 */
void
vm_page_unmanage(vm_page_t m)
{
	int s;

	s = splvm();
	if ((m->flags & PG_UNMANAGED) == 0) {
		if (m->wire_count == 0)
			vm_page_unqueue(m);
	}
	vm_page_flag_set(m, PG_UNMANAGED);
	splx(s);
}

/*
 *	vm_page_wire:
 *
 *	Mark this page as wired down by yet
 *	another map, removing it from paging queues
 *	as necessary.
 *
 *	The page queues must be locked.
 *	This routine may not block.
 */
void
vm_page_wire(vm_page_t m)
{
	int s;

	/*
	 * Only bump the wire statistics if the page is not already wired,
	 * and only unqueue the page if it is on some queue (if it is unmanaged
	 * it is already off the queues).
	 */
	s = splvm();
	if (m->wire_count == 0) {
		if ((m->flags & PG_UNMANAGED) == 0)
			vm_page_unqueue(m);
		vmstats.v_wire_count++;
	}
	m->wire_count++;
	KASSERT(m->wire_count != 0,
	    ("vm_page_wire: wire_count overflow m=%p", m));

	splx(s);
	vm_page_flag_set(m, PG_MAPPED);
}
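/*
 * Illustrative sketch (not part of the original code): vm_page_wire()
 * and vm_page_unwire() are used as a balanced pair around an operation
 * that must keep a page resident, e.g. while the page is the target of
 * a device transfer:
 *
 *	vm_page_wire(m);		// page leaves the paging queues
 *	// ... the page cannot be paged out here ...
 *	vm_page_unwire(m, 1);		// last unwire re-activates the page
 *
 * Passing 0 as the second argument instead places the page on the
 * inactive queue, as described in the vm_page_unwire() comment below.
 */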
/*
 *	vm_page_unwire:
 *
 *	Release one wiring of this page, potentially
 *	enabling it to be paged again.
 *
 *	Many pages placed on the inactive queue should actually go
 *	into the cache, but it is difficult to figure out which.  What
 *	we do instead, if the inactive target is well met, is to put
 *	clean pages at the head of the inactive queue instead of the tail.
 *	This will cause them to be moved to the cache more quickly and
 *	if not actively re-referenced, freed more quickly.  If we just
 *	stick these pages at the end of the inactive queue, heavy filesystem
 *	meta-data accesses can cause an unnecessary paging load on memory bound
 *	processes.  This optimization causes one-time-use metadata to be
 *	reused more quickly.
 *
 *	BUT, if we are in a low-memory situation we have no choice but to
 *	put clean pages on the cache queue.
 *
 *	A number of routines use vm_page_unwire() to guarantee that the page
 *	will go into either the inactive or active queues, and will NEVER
 *	be placed in the cache - for example, just after dirtying a page.
 *	dirty pages in the cache are not allowed.
 *
 *	The page queues must be locked.
 *	This routine may not block.
 */
void
vm_page_unwire(vm_page_t m, int activate)
{
	int s;

	s = splvm();

	if (m->wire_count > 0) {
		m->wire_count--;
		if (m->wire_count == 0) {
			vmstats.v_wire_count--;
			if (m->flags & PG_UNMANAGED) {
				;
			} else if (activate) {
				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
				m->queue = PQ_ACTIVE;
				vm_page_queues[PQ_ACTIVE].lcnt++;
				vmstats.v_active_count++;
			} else {
				vm_page_flag_clear(m, PG_WINATCFLS);
				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
				m->queue = PQ_INACTIVE;
				vm_page_queues[PQ_INACTIVE].lcnt++;
				vmstats.v_inactive_count++;
			}
		}
	} else {
		panic("vm_page_unwire: invalid wire count: %d\n", m->wire_count);
	}
	splx(s);
}


/*
 * Move the specified page to the inactive queue.  If the page has
 * any associated swap, the swap is deallocated.
 *
 * Normally athead is 0 resulting in LRU operation.  athead is set
 * to 1 if we want this page to be 'as if it were placed in the cache',
 * except without unmapping it from the process address space.
 *
 * This routine may not block.
 */
static __inline void
_vm_page_deactivate(vm_page_t m, int athead)
{
	int s;

	/*
	 * Ignore if already inactive.
	 */
	if (m->queue == PQ_INACTIVE)
		return;

	s = splvm();
	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
		if ((m->queue - m->pc) == PQ_CACHE)
			mycpu->gd_cnt.v_reactivated++;
		vm_page_flag_clear(m, PG_WINATCFLS);
		vm_page_unqueue(m);
		if (athead)
			TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
		else
			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
		m->queue = PQ_INACTIVE;
		vm_page_queues[PQ_INACTIVE].lcnt++;
		vmstats.v_inactive_count++;
	}
	splx(s);
}

void
vm_page_deactivate(vm_page_t m)
{
	_vm_page_deactivate(m, 0);
}

/*
 * vm_page_try_to_cache:
 *
 *	Returns 0 on failure, 1 on success
 */
int
vm_page_try_to_cache(vm_page_t m)
{
	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
	    (m->flags & (PG_BUSY|PG_UNMANAGED))) {
		return(0);
	}
	vm_page_test_dirty(m);
	if (m->dirty)
		return(0);
	vm_page_cache(m);
	return(1);
}

/*
 * vm_page_try_to_free()
 *
 *	Attempt to free the page.  If we cannot free it, we do nothing.
 *	1 is returned on success, 0 on failure.
 */
int
vm_page_try_to_free(vm_page_t m)
{
	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
	    (m->flags & (PG_BUSY|PG_UNMANAGED))) {
		return(0);
	}
	vm_page_test_dirty(m);
	if (m->dirty)
		return(0);
	vm_page_busy(m);
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_free(m);
	return(1);
}
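/*
 * Illustrative sketch (not part of the original code): the "try" variants
 * above are convenience wrappers for opportunistic reclaim - the caller
 * simply attempts the transition and carries on if the page is not an
 * eligible candidate:
 *
 *	if (vm_page_try_to_free(m) == 0) {
 *		// page was busy, wired, held, unmanaged, or dirty;
 *		// leave it alone
 *	}
 *
 * No cleanup is required on failure, which is what makes these routines
 * convenient in scan loops.
 */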
/*
 * vm_page_cache
 *
 *	Put the specified page onto the page cache queue (if appropriate).
 *
 *	This routine may not block.
 */
void
vm_page_cache(vm_page_t m)
{
	int s;

	if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy || m->wire_count) {
		printf("vm_page_cache: attempting to cache busy page\n");
		return;
	}
	if ((m->queue - m->pc) == PQ_CACHE)
		return;

	/*
	 * Remove all pmaps and indicate that the page is not
	 * writeable or mapped.
	 */

	vm_page_protect(m, VM_PROT_NONE);
	if (m->dirty != 0) {
		panic("vm_page_cache: caching a dirty page, pindex: %ld",
			(long)m->pindex);
	}
	s = splvm();
	vm_page_unqueue_nowakeup(m);
	m->queue = PQ_CACHE + m->pc;
	vm_page_queues[m->queue].lcnt++;
	TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
	vmstats.v_cache_count++;
	vm_page_free_wakeup();
	splx(s);
}

/*
 * vm_page_dontneed
 *
 *	Cache, deactivate, or do nothing as appropriate.  This routine
 *	is typically used by madvise() MADV_DONTNEED.
 *
 *	Generally speaking we want to move the page into the cache so
 *	it gets reused quickly.  However, this can result in a silly syndrome
 *	due to the page recycling too quickly.  Small objects will not be
 *	fully cached.  On the other hand, if we move the page to the inactive
 *	queue we wind up with a problem whereby very large objects
 *	unnecessarily blow away our inactive and cache queues.
 *
 *	The solution is to move the pages based on a fixed weighting.  We
 *	either leave them alone, deactivate them, or move them to the cache,
 *	where moving them to the cache has the highest weighting.
 *	By forcing some pages into other queues we eventually force the
 *	system to balance the queues, potentially recovering other unrelated
 *	space from active.  The idea is to not force this to happen too
 *	often.
 */
void
vm_page_dontneed(vm_page_t m)
{
	static int dnweight;
	int dnw;
	int head;

	dnw = ++dnweight;

	/*
	 * occasionally leave the page alone
	 */

	if ((dnw & 0x01F0) == 0 ||
	    m->queue == PQ_INACTIVE ||
	    m->queue - m->pc == PQ_CACHE
	) {
		if (m->act_count >= ACT_INIT)
			--m->act_count;
		return;
	}

	if (m->dirty == 0)
		vm_page_test_dirty(m);

	if (m->dirty || (dnw & 0x0070) == 0) {
		/*
		 * Deactivate the page 3 times out of 32.
		 */
		head = 0;
	} else {
		/*
		 * Cache the page 28 times out of every 32.  Note that
		 * the page is deactivated instead of cached, but placed
		 * at the head of the queue instead of the tail.
		 */
		head = 1;
	}
	_vm_page_deactivate(m, head);
}

/*
 * Grab a page, waiting until we are woken up due to the page
 * changing state.  We keep on waiting as long as the page continues
 * to be in the object.  If the page doesn't exist, allocate it.
 *
 * This routine may block.
 */
vm_page_t
vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
{
	vm_page_t m;
	int s, generation;

retrylookup:
	if ((m = vm_page_lookup(object, pindex)) != NULL) {
		if (m->busy || (m->flags & PG_BUSY)) {
			generation = object->generation;

			s = splvm();
			while ((object->generation == generation) &&
					(m->busy || (m->flags & PG_BUSY))) {
				vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
				tsleep(m, 0, "pgrbwt", 0);
				if ((allocflags & VM_ALLOC_RETRY) == 0) {
					splx(s);
					return NULL;
				}
			}
			splx(s);
			goto retrylookup;
		} else {
			vm_page_busy(m);
			return m;
		}
	}

	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY);
	if (m == NULL) {
		VM_WAIT;
		if ((allocflags & VM_ALLOC_RETRY) == 0)
			return NULL;
		goto retrylookup;
	}

	return m;
}
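/*
 * Illustrative sketch (not part of the original code): the common way to
 * call vm_page_grab() when the caller wants a busied page at a given
 * offset no matter what, sleeping as needed:
 *
 *	vm_page_t m;
 *
 *	m = vm_page_grab(object, pindex,
 *	    VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 *	// with VM_ALLOC_RETRY the routine above never returns NULL;
 *	// the page comes back busied and must eventually be released
 *	// by the caller with vm_page_wakeup().
 */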
/*
 * Mapping function for valid bits or for dirty bits in
 * a page.  May not block.
 *
 * Inputs are required to range within a page.
 */
__inline int
vm_page_bits(int base, int size)
{
	int first_bit;
	int last_bit;

	KASSERT(
	    base + size <= PAGE_SIZE,
	    ("vm_page_bits: illegal base/size %d/%d", base, size)
	);

	if (size == 0)		/* handle degenerate case */
		return(0);

	first_bit = base >> DEV_BSHIFT;
	last_bit = (base + size - 1) >> DEV_BSHIFT;

	return ((2 << last_bit) - (1 << first_bit));
}
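/*
 * Illustrative worked example (not part of the original code), assuming
 * DEV_BSIZE == 512 (DEV_BSHIFT == 9) and PAGE_SIZE == 4096, so a page is
 * covered by 8 valid/dirty bits; these are the usual values but are
 * platform configuration, not guaranteed here:
 *
 *	vm_page_bits(0, 4096)   -> first_bit 0, last_bit 7 -> 0xff
 *	vm_page_bits(512, 1024) -> first_bit 1, last_bit 2 -> 0x06
 *	vm_page_bits(100, 1)    -> first_bit 0, last_bit 0 -> 0x01
 *
 * i.e. the result is a contiguous run of bits, one per DEV_BSIZE chunk
 * touched by [base, base+size).
 */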
/*
 *	vm_page_set_validclean:
 *
 *	Sets portions of a page valid and clean.  The arguments are expected
 *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 *	of any partial chunks touched by the range.  The invalid portion of
 *	such chunks will be zero'd.
 *
 *	This routine may not block.
 *
 *	(base + size) must be less than or equal to PAGE_SIZE.
 */
void
vm_page_set_validclean(vm_page_t m, int base, int size)
{
	int pagebits;
	int frag;
	int endoff;

	if (size == 0)	/* handle degenerate case */
		return;

	/*
	 * If the base is not DEV_BSIZE aligned and the valid
	 * bit is clear, we have to zero out a portion of the
	 * first block.
	 */

	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
	) {
		pmap_zero_page_area(
		    VM_PAGE_TO_PHYS(m),
		    frag,
		    base - frag
		);
	}

	/*
	 * If the ending offset is not DEV_BSIZE aligned and the
	 * valid bit is clear, we have to zero out a portion of
	 * the last block.
	 */

	endoff = base + size;

	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
	) {
		pmap_zero_page_area(
		    VM_PAGE_TO_PHYS(m),
		    endoff,
		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
		);
	}

	/*
	 * Set valid, clear dirty bits.  If validating the entire
	 * page we can safely clear the pmap modify bit.  We also
	 * use this opportunity to clear the PG_NOSYNC flag.  If a process
	 * takes a write fault on a MAP_NOSYNC memory area the flag will
	 * be set again.
	 *
	 * We set valid bits inclusive of any overlap, but we can only
	 * clear dirty bits for DEV_BSIZE chunks that are fully within
	 * the range.
	 */

	pagebits = vm_page_bits(base, size);
	m->valid |= pagebits;
#if 0	/* NOT YET */
	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
		frag = DEV_BSIZE - frag;
		base += frag;
		size -= frag;
		if (size < 0)
			size = 0;
	}
	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
#endif
	m->dirty &= ~pagebits;
	if (base == 0 && size == PAGE_SIZE) {
		pmap_clear_modify(m);
		vm_page_flag_clear(m, PG_NOSYNC);
	}
}

#if 0

void
vm_page_set_dirty(vm_page_t m, int base, int size)
{
	m->dirty |= vm_page_bits(base, size);
}

#endif

void
vm_page_clear_dirty(vm_page_t m, int base, int size)
{
	m->dirty &= ~vm_page_bits(base, size);
}

/*
 *	vm_page_set_invalid:
 *
 *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
 *	valid and dirty bits for the affected areas are cleared.
 *
 *	May not block.
 */
void
vm_page_set_invalid(vm_page_t m, int base, int size)
{
	int bits;

	bits = vm_page_bits(base, size);
	m->valid &= ~bits;
	m->dirty &= ~bits;
	m->object->generation++;
}

/*
 * vm_page_zero_invalid()
 *
 *	The kernel assumes that the invalid portions of a page contain
 *	garbage, but such pages can be mapped into memory by user code.
 *	When this occurs, we must zero out the non-valid portions of the
 *	page so user code sees what it expects.
 *
 *	Pages are most often semi-valid when the end of a file is mapped
 *	into memory and the file's size is not page aligned.
 */
void
vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
{
	int b;
	int i;

	/*
	 * Scan the valid bits looking for invalid sections that
	 * must be zero'd.  Invalid sub-DEV_BSIZE'd areas ( where the
	 * valid bit may be set ) have already been zero'd by
	 * vm_page_set_validclean().
	 */

	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
		if (i == (PAGE_SIZE / DEV_BSIZE) ||
		    (m->valid & (1 << i))
		) {
			if (i > b) {
				pmap_zero_page_area(
				    VM_PAGE_TO_PHYS(m),
				    b << DEV_BSHIFT,
				    (i - b) << DEV_BSHIFT
				);
			}
			b = i + 1;
		}
	}

	/*
	 * setvalid is TRUE when we can safely set the zero'd areas
	 * as being valid.  We can do this if there are no cache consistency
	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
	 */

	if (setvalid)
		m->valid = VM_PAGE_BITS_ALL;
}

/*
 *	vm_page_is_valid:
 *
 *	Is (partial) page valid?  Note that the case where size == 0
 *	will return FALSE in the degenerate case where the page is
 *	entirely invalid, and TRUE otherwise.
 *
 *	May not block.
 */
int
vm_page_is_valid(vm_page_t m, int base, int size)
{
	int bits = vm_page_bits(base, size);

	if (m->valid && ((m->valid & bits) == bits))
		return 1;
	else
		return 0;
}

/*
 * update dirty bits from pmap/mmu.  May not block.
 */
void
vm_page_test_dirty(vm_page_t m)
{
	if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
		vm_page_dirty(m);
	}
}
/*
 * This interface is for merging with malloc() someday.
 * Even if we never implement compaction so that contiguous allocation
 * works after initialization time, malloc()'s data structures are good
 * for statistics and for allocations of less than a page.
 */
void *
contigmalloc1(
	unsigned long size,	/* should be size_t here and for malloc() */
	struct malloc_type *type,
	int flags,
	unsigned long low,
	unsigned long high,
	unsigned long alignment,
	unsigned long boundary,
	vm_map_t map)
{
	int i, s, start;
	vm_offset_t addr, phys, tmp_addr;
	int pass;
	vm_page_t pga = vm_page_array;
	int count;

	size = round_page(size);
	if (size == 0)
		panic("contigmalloc1: size must not be 0");
	if ((alignment & (alignment - 1)) != 0)
		panic("contigmalloc1: alignment must be a power of 2");
	if ((boundary & (boundary - 1)) != 0)
		panic("contigmalloc1: boundary must be a power of 2");

	start = 0;
	for (pass = 0; pass <= 1; pass++) {
		s = splvm();
again:
		/*
		 * Find first page in array that is free, within range, aligned, and
		 * such that the boundary won't be crossed.
		 */
		for (i = start; i < vmstats.v_page_count; i++) {
			int pqtype;
			phys = VM_PAGE_TO_PHYS(&pga[i]);
			pqtype = pga[i].queue - pga[i].pc;
			if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
			    (phys >= low) && (phys < high) &&
			    ((phys & (alignment - 1)) == 0) &&
			    (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0))
				break;
		}

		/*
		 * If the above failed or we will exceed the upper bound, fail.
		 */
		if ((i == vmstats.v_page_count) ||
			((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {
			vm_page_t m, next;

again1:
			for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
				m != NULL;
				m = next) {

				KASSERT(m->queue == PQ_INACTIVE,
					("contigmalloc1: page %p is not PQ_INACTIVE", m));

				next = TAILQ_NEXT(m, pageq);
				if (vm_page_sleep_busy(m, TRUE, "vpctw0"))
					goto again1;
				vm_page_test_dirty(m);
				if (m->dirty) {
					if (m->object->type == OBJT_VNODE) {
						vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curthread);
						vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
						VOP_UNLOCK(m->object->handle, 0, curthread);
						goto again1;
					} else if (m->object->type == OBJT_SWAP ||
							m->object->type == OBJT_DEFAULT) {
						vm_pageout_flush(&m, 1, 0);
						goto again1;
					}
				}
				if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
					vm_page_cache(m);
			}

			for (m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
				m != NULL;
				m = next) {

				KASSERT(m->queue == PQ_ACTIVE,
					("contigmalloc1: page %p is not PQ_ACTIVE", m));

				next = TAILQ_NEXT(m, pageq);
				if (vm_page_sleep_busy(m, TRUE, "vpctw1"))
					goto again1;
				vm_page_test_dirty(m);
				if (m->dirty) {
					if (m->object->type == OBJT_VNODE) {
						vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curthread);
						vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
						VOP_UNLOCK(m->object->handle, 0, curthread);
						goto again1;
					} else if (m->object->type == OBJT_SWAP ||
							m->object->type == OBJT_DEFAULT) {
						vm_pageout_flush(&m, 1, 0);
						goto again1;
					}
				}
				if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
					vm_page_cache(m);
			}

			splx(s);
			continue;
		}
		start = i;
		/*
		 * Check successive pages for contiguous and free.
		 */
		for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
			int pqtype;
			pqtype = pga[i].queue - pga[i].pc;
			if ((VM_PAGE_TO_PHYS(&pga[i]) !=
			    (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) ||
			    ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) {
				start++;
				goto again;
			}
		}

		for (i = start; i < (start + size / PAGE_SIZE); i++) {
			int pqtype;
			vm_page_t m = &pga[i];

			pqtype = m->queue - m->pc;
			if (pqtype == PQ_CACHE) {
				vm_page_busy(m);
				vm_page_free(m);
			}
			vm_page_unqueue_nowakeup(m);
			m->valid = VM_PAGE_BITS_ALL;
			if (m->flags & PG_ZERO)
				vm_page_zero_count--;
			m->flags = 0;
			KASSERT(m->dirty == 0, ("contigmalloc1: page %p was dirty", m));
			m->wire_count = 0;
			m->busy = 0;
			m->object = NULL;
		}

		/*
		 * We've found a contiguous chunk that meets our requirements.
		 * Allocate kernel VM, unfree and assign the physical pages to it and
		 * return kernel VM pointer.
		 */
		vm_map_lock(map);
		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
		if (vm_map_findspace(map, vm_map_min(map), size, 1, &addr) !=
		    KERN_SUCCESS) {
			/*
			 * XXX We almost never run out of kernel virtual
			 * space, so we don't make the allocated memory
			 * above available.
			 */
			vm_map_unlock(map);
			vm_map_entry_release(count);
			splx(s);
			return (NULL);
		}
		vm_object_reference(kernel_object);
		vm_map_insert(map, &count,
		    kernel_object, addr - VM_MIN_KERNEL_ADDRESS,
		    addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0);
		vm_map_unlock(map);
		vm_map_entry_release(count);

		tmp_addr = addr;
		for (i = start; i < (start + size / PAGE_SIZE); i++) {
			vm_page_t m = &pga[i];
			vm_page_insert(m, kernel_object,
				OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
			tmp_addr += PAGE_SIZE;
		}
		vm_map_pageable(map, addr, addr + size, FALSE);

		splx(s);
		return ((void *)addr);
	}
	return NULL;
}

void *
contigmalloc(
	unsigned long size,	/* should be size_t here and for malloc() */
	struct malloc_type *type,
	int flags,
	unsigned long low,
	unsigned long high,
	unsigned long alignment,
	unsigned long boundary)
{
	return contigmalloc1(size, type, flags, low, high, alignment, boundary,
			kernel_map);
}

void
contigfree(void *addr, unsigned long size, struct malloc_type *type)
{
	kmem_free(kernel_map, (vm_offset_t)addr, size);
}

vm_offset_t
vm_page_alloc_contig(
	vm_offset_t size,
	vm_offset_t low,
	vm_offset_t high,
	vm_offset_t alignment)
{
	return ((vm_offset_t)contigmalloc1(size, M_DEVBUF, M_NOWAIT, low, high,
				alignment, 0ul, kernel_map));
}
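/*
 * Illustrative sketch (not part of the original code): a driver asking
 * for a physically contiguous, 64KB-aligned DMA buffer below 16MB might
 * call contigmalloc() roughly like this (the size and constraints are
 * made up for the example):
 *
 *	void *buf;
 *
 *	buf = contigmalloc(65536, M_DEVBUF, M_NOWAIT,
 *	    0,			// low: any physical address >= 0
 *	    16 * 1024 * 1024,	// high: below 16MB
 *	    65536,		// alignment
 *	    0);			// boundary: no boundary restriction
 *	if (buf != NULL)
 *		contigfree(buf, 65536, M_DEVBUF);
 */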
#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>

#include <ddb/ddb.h>

DB_SHOW_COMMAND(page, vm_page_print_page_info)
{
	db_printf("vmstats.v_free_count: %d\n", vmstats.v_free_count);
	db_printf("vmstats.v_cache_count: %d\n", vmstats.v_cache_count);
	db_printf("vmstats.v_inactive_count: %d\n", vmstats.v_inactive_count);
	db_printf("vmstats.v_active_count: %d\n", vmstats.v_active_count);
	db_printf("vmstats.v_wire_count: %d\n", vmstats.v_wire_count);
	db_printf("vmstats.v_free_reserved: %d\n", vmstats.v_free_reserved);
	db_printf("vmstats.v_free_min: %d\n", vmstats.v_free_min);
	db_printf("vmstats.v_free_target: %d\n", vmstats.v_free_target);
	db_printf("vmstats.v_cache_min: %d\n", vmstats.v_cache_min);
	db_printf("vmstats.v_inactive_target: %d\n", vmstats.v_inactive_target);
}

DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{
	int i;

	db_printf("PQ_FREE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
	}
	db_printf("\n");

	db_printf("PQ_CACHE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
	}
	db_printf("\n");

	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
		vm_page_queues[PQ_ACTIVE].lcnt,
		vm_page_queues[PQ_INACTIVE].lcnt);
}
#endif /* DDB */