/*
 * Copyright (c) 1991, 1993, 2013
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
 */

/*
 * Virtual memory object module.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>		/* for curproc, pageproc */
#include <sys/thread.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/refcount.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <vm/vm_page2.h>

#include <machine/specialreg.h>

#define EASY_SCAN_FACTOR	8

static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
					     int pagerflags);
static void	vm_object_lock_init(vm_object_t);

/*
 * Virtual memory objects maintain the actual data
 * associated with allocated virtual memory.  A given
 * page of memory exists within exactly one object.
 *
 * An object is only deallocated when all "references"
 * are given up.  Only one "reference" to a given
 * region of an object should be writeable.
 *
 * Associated with each object is a list of all resident
 * memory pages belonging to that object; this list is
 * maintained by the "vm_page" module, and locked by the object's
 * lock.
 *
 * Each object also records a "pager" routine which is
 * used to retrieve (and store) pages to the proper backing
 * storage.  In addition, objects may be backed by other
 * objects from which they were virtual-copied.
 *
 * The only items within the object structure which are
 * modified after time of creation are:
 *	reference count		locked by object's lock
 *	pager routine		locked by object's lock
 *
 */

struct vm_object kernel_object;

struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];

MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");

#define VMOBJ_HASH_PRIME1	66555444443333333ULL
#define VMOBJ_HASH_PRIME2	989042931893ULL

int vm_object_debug;
SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
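
/*
 * Editor's note (descriptive, derived from the code below): hash an object
 * pointer to one of the vm_object_hash[] buckets.  The pointer is folded at
 * several shift offsets and reduced modulo two different large primes before
 * the results are xor'd and masked with VMOBJ_HMASK, which helps keep the
 * regular allocation alignment of object addresses from biasing the bucket
 * distribution.
 */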
static __inline
struct vm_object_hash *
vmobj_hash(vm_object_t obj)
{
	uintptr_t hash1;
	uintptr_t hash2;

	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
	hash1 %= VMOBJ_HASH_PRIME1;
	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
	hash2 %= VMOBJ_HASH_PRIME2;
	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
}

#if defined(DEBUG_LOCKS)

#define vm_object_vndeallocate(obj, vpp)	\
	debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)

/*
 * Debug helper to track hold/drop/ref/deallocate calls.
 */
static void
debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
{
	int i;

	i = atomic_fetchadd_int(&obj->debug_index, 1);
	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
	ksnprintf(obj->debug_hold_thrs[i],
		  sizeof(obj->debug_hold_thrs[i]),
		  "%c%d:(%d):%s",
		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
		  (curthread->td_proc ? curthread->td_proc->p_pid : -1),
		  obj->ref_count,
		  curthread->td_comm);
	obj->debug_hold_file[i] = file;
	obj->debug_hold_line[i] = line;
#if 0
	/* Uncomment for debugging obj refs/derefs in reproducible cases */
	if (strcmp(curthread->td_comm, "sshd") == 0) {
		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
			obj, obj->ref_count, addrem, file, line);
	}
#endif
}

#endif

/*
 * Misc low level routines
 */
static void
vm_object_lock_init(vm_object_t obj)
{
#if defined(DEBUG_LOCKS)
	int i;

	obj->debug_index = 0;
	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
		obj->debug_hold_thrs[i][0] = 0;
		obj->debug_hold_file[i] = NULL;
		obj->debug_hold_line[i] = 0;
	}
#endif
}

void
vm_object_lock_swap(void)
{
	lwkt_token_swap();
}

void
vm_object_lock(vm_object_t obj)
{
	lwkt_gettoken(&obj->token);
}

/*
 * Returns TRUE on success
 */
static int
vm_object_lock_try(vm_object_t obj)
{
	return(lwkt_trytoken(&obj->token));
}

void
vm_object_lock_shared(vm_object_t obj)
{
	lwkt_gettoken_shared(&obj->token);
}

void
vm_object_unlock(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
}
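
/*
 * Editor's note (descriptive, derived from the code below): the upgrade and
 * downgrade helpers release the token and reacquire it in the desired mode.
 * They are therefore not atomic and can block, so callers must re-validate
 * any state they checked under the previous lock (vm_object_vndeallocate()
 * below, for example, re-checks ref_count after its upgrade).
 */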
void
vm_object_upgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken(&obj->token);
}

void
vm_object_downgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken_shared(&obj->token);
}

static __inline void
vm_object_assert_held(vm_object_t obj)
{
	ASSERT_LWKT_TOKEN_HELD(&obj->token);
}
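
/*
 * Editor's note (descriptive, derived from the code below): return a
 * starting page color hint.  The hint mixes the current thread pointer with
 * a per-cpu counter that advances by PQ_PRIME2 on every call, which tends
 * to spread successive allocations across the page coloring queues.  In
 * this file it seeds object->pg_color in _vm_object_allocate().
 */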
int
vm_quickcolor(void)
{
	globaldata_t gd = mycpu;
	int pg_color;

	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
	pg_color += gd->gd_quick_color;
	gd->gd_quick_color += PQ_PRIME2;

	return pg_color;
}

void
VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a parent
	 * object) prior to potentially blocking on the lock, otherwise the
	 * object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}

int
VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a parent
	 * object) prior to potentially blocking on the lock, otherwise the
	 * object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	if (vm_object_lock_try(obj) == 0) {
		if (refcount_release(&obj->hold_count)) {
			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
				kfree(obj, M_VM_OBJECT);
		}
		return(0);
	}

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
	return(1);
}

void
VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a parent
	 * object) prior to potentially blocking on the lock, otherwise the
	 * object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock_shared(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}

/*
 * Drop the token and hold_count on the object.
 *
 * WARNING! Token might be shared.
 */
void
VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
{
	if (obj == NULL)
		return;

	/*
	 * No new holders should be possible once we drop hold_count 1->0 as
	 * there is no longer any way to reference the object.
	 */
	KKASSERT(obj->hold_count > 0);
	if (refcount_release(&obj->hold_count)) {
#if defined(DEBUG_LOCKS)
		debugvm_object_add(obj, file, line, -1);
#endif

		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
			vm_object_unlock(obj);
			kfree(obj, M_VM_OBJECT);
		} else {
			vm_object_unlock(obj);
		}
	} else {
#if defined(DEBUG_LOCKS)
		debugvm_object_add(obj, file, line, -1);
#endif
		vm_object_unlock(obj);
	}
}
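
/*
 * Illustrative usage sketch of the hold/drop API above (not a specific
 * caller, just the typical pattern):
 *
 *	vm_object_hold(object);
 *	... examine or modify the object ...
 *	vm_object_drop(object);
 *
 * The hold bumps hold_count and acquires the object token, so even if the
 * last ref_count reference goes away while the holder is blocked, the
 * structure is not kfree()'d until the final hold is dropped.
 */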

/*
 * Initialize a freshly allocated object, returning a held object.
 *
 * Used only by vm_object_allocate(), zinitna() and vm_object_init().
 *
 * No requirements.
 */
void
_vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object,
		    const char *ident)
{
	struct vm_object_hash *hash;

	RB_INIT(&object->rb_memq);
	lwkt_token_init(&object->token, ident);

	TAILQ_INIT(&object->backing_list);
	lockinit(&object->backing_lk, "baclk", 0, 0);

	object->type = type;
	object->size = size;
	object->ref_count = 1;
	object->memattr = VM_MEMATTR_DEFAULT;
	object->hold_count = 0;
	object->flags = 0;
	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
		vm_object_set_flag(object, OBJ_ONEMAPPING);
	object->paging_in_progress = 0;
	object->resident_page_count = 0;
	/* cpu localization twist */
	object->pg_color = vm_quickcolor();
	object->handle = NULL;

	atomic_add_int(&object->generation, 1);
	object->swblock_count = 0;
	RB_INIT(&object->swblock_root);
	vm_object_lock_init(object);
	pmap_object_init(object);

	vm_object_hold(object);

	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
	lwkt_reltoken(&hash->token);
}

/*
 * Initialize a VM object.
 */
void
vm_object_init(vm_object_t object, vm_pindex_t size)
{
	_vm_object_allocate(OBJT_DEFAULT, size, object, "vmobj");
	vm_object_drop(object);
}

/*
 * Initialize the VM objects module.
 *
 * Called from the low level boot code only.  Note that this occurs before
 * kmalloc is initialized so we cannot allocate any VM objects.
 */
void
vm_object_init1(void)
{
	int i;

	for (i = 0; i < VMOBJ_HSIZE; ++i) {
		TAILQ_INIT(&vm_object_hash[i].list);
		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
	}

	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
			    &kernel_object, "kobj");
	vm_object_drop(&kernel_object);
}

void
vm_object_init2(void)
{
	kmalloc_set_unlimited(M_VM_OBJECT);
}

/*
 * Allocate and return a new object of the specified type and size.
 *
 * No requirements.
 */
vm_object_t
vm_object_allocate(objtype_t type, vm_pindex_t size)
{
	vm_object_t obj;

	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
	_vm_object_allocate(type, size, obj, "vmobj");
	vm_object_drop(obj);

	return (obj);
}

/*
 * This version returns a held object, allowing further atomic initialization
 * of the object.
 */
vm_object_t
vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
{
	vm_object_t obj;

	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
	_vm_object_allocate(type, size, obj, "vmobj");

	return (obj);
}

/*
 * Add an additional reference to a vm_object.  The object must already be
 * held.  The original non-lock version is no longer supported.  The object
 * must NOT be chain locked by anyone at the time the reference is added.
 *
 * The object must be held, but may be held shared if desired (hence why
 * we use an atomic op).
 */
void
VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object != NULL);
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	atomic_add_int(&object->ref_count, 1);
	if (object->type == OBJT_VNODE) {
		vref(object->handle);
		/* XXX what if the vnode is being destroyed? */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}

/*
 * This version is only allowed in situations where the caller
 * already knows that the object is deterministically referenced
 * (usually because it's taken from a ref'd vnode, or during a map_entry
 * replication).
 */
void
VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
	atomic_add_int(&object->ref_count, 1);
	if (object->type == OBJT_VNODE)
		vref(object->handle);
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}

/*
 * Dereference an object and its underlying vnode.  The object may be
 * held shared.  On return the object will remain held.
 *
 * This function may return a vnode in *vpp which the caller must release
 * after the caller drops its own lock.  If vpp is NULL, we assume that
 * the caller was holding an exclusive lock on the object and we vrele()
 * the vp ourselves.
 */
static void
VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
				   VMOBJDBARGS)
{
	struct vnode *vp = (struct vnode *) object->handle;
	int count;

	KASSERT(object->type == OBJT_VNODE,
	    ("vm_object_vndeallocate: not a vnode object"));
	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
#ifdef INVARIANTS
	if (object->ref_count == 0) {
		vprint("vm_object_vndeallocate", vp);
		panic("vm_object_vndeallocate: bad object reference count");
	}
#endif
	count = object->ref_count;
	cpu_ccfence();
	for (;;) {
		if (count == 1) {
			vm_object_upgrade(object);
			if (atomic_fcmpset_int(&object->ref_count, &count, 0)) {
				vclrflags(vp, VTEXT);
				break;
			}
		} else {
			if (atomic_fcmpset_int(&object->ref_count,
					       &count, count - 1)) {
				break;
			}
		}
		cpu_pause();
		/* retry */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, -1);
#endif

	/*
	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
	 * if the object was locked exclusively.  But there are two races
	 * here.
	 *
	 * We had to upgrade the object above to safely clear VTEXT
	 * but the alternative path where the shared lock is retained
	 * can STILL race to 0 in other paths and cause our own vrele()
	 * to terminate the vnode.  We can't allow that if the VM object
	 * is still locked shared.
	 */
	if (vpp)
		*vpp = vp;
	else
		vrele(vp);
}

/*
 * Release a reference to the specified object, gained either through a
 * vm_object_allocate or a vm_object_reference call.  When all references
 * are gone, storage associated with this object may be relinquished.
 *
 * The caller does not have to hold the object locked but must have control
 * over the reference in question in order to guarantee that the object
 * does not get ripped out from under us.
 *
 * XXX Currently all deallocations require an exclusive lock.
 */
void
VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
{
	struct vnode *vp;
	int count;

	if (object == NULL)
		return;

	count = object->ref_count;
	cpu_ccfence();
	for (;;) {
		/*
		 * If decrementing the count enters into special handling
		 * territory (0, 1, or 2) we have to do it the hard way.
		 * Fortunately, objects with only a few refs like this
		 * are not likely to be heavily contended anyway.
		 *
		 * For vnode objects we only care about 1->0 transitions.
		 */
		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
#if defined(DEBUG_LOCKS)
			debugvm_object_add(object, file, line, 0);
#endif
			vm_object_hold(object);
			vm_object_deallocate_locked(object);
			vm_object_drop(object);
			break;
		}

		/*
		 * Try to decrement ref_count without acquiring a hold on
		 * the object.  This is particularly important for the exec*()
		 * and exit*() code paths because the program binary may
		 * have a great deal of sharing and an exclusive lock will
		 * crowbar performance in those circumstances.
		 */
		if (object->type == OBJT_VNODE) {
			vp = (struct vnode *)object->handle;
			if (atomic_fcmpset_int(&object->ref_count,
					       &count, count - 1)) {
#if defined(DEBUG_LOCKS)
				debugvm_object_add(object, file, line, -1);
#endif

				vrele(vp);
				break;
			}
			/* retry */
		} else {
			if (atomic_fcmpset_int(&object->ref_count,
					       &count, count - 1)) {
#if defined(DEBUG_LOCKS)
				debugvm_object_add(object, file, line, -1);
#endif
				break;
			}
			/* retry */
		}
		cpu_pause();
		/* retry */
	}
}

void
VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
{
	/*
	 * Degenerate case
	 */
	if (object == NULL)
		return;

	/*
	 * vnode case, caller either locked the object exclusively
	 * or this is a recursion with must_drop != 0 and the vnode
	 * object will be locked shared.
	 *
	 * If locked shared we have to drop the object before we can
	 * call vrele() or risk a shared/exclusive livelock.
	 */
	if (object->type == OBJT_VNODE) {
		ASSERT_LWKT_TOKEN_HELD(&object->token);
		vm_object_vndeallocate(object, NULL);
		return;
	}
	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);

	/*
	 * Normal case (object is locked exclusively)
	 */
	if (object->ref_count == 0) {
		panic("vm_object_deallocate: object deallocated "
		      "too many times: %d", object->type);
	}
	if (object->ref_count > 2) {
		atomic_add_int(&object->ref_count, -1);
#if defined(DEBUG_LOCKS)
		debugvm_object_add(object, file, line, -1);
#endif
		return;
	}

	/*
	 * Drop the ref and handle termination on the 1->0 transition.
	 * We may have blocked above so we have to recheck.
	 */
	KKASSERT(object->ref_count != 0);
	if (object->ref_count >= 2) {
		atomic_add_int(&object->ref_count, -1);
#if defined(DEBUG_LOCKS)
		debugvm_object_add(object, file, line, -1);
#endif
		return;
	}

	atomic_add_int(&object->ref_count, -1);
	if ((object->flags & OBJ_DEAD) == 0)
		vm_object_terminate(object);
}

/*
 * Destroy the specified object, freeing up related resources.
 *
 * The object must have zero references.
 *
 * The object must be held.  The caller is responsible for dropping the
 * object after terminate returns.  Terminate does NOT drop the object.
 */
static int vm_object_terminate_callback(vm_page_t p, void *data);

void
vm_object_terminate(vm_object_t object)
{
	struct rb_vm_page_scan_info info;
	struct vm_object_hash *hash;

	/*
	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
	 * able to safely block.
	 */
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	KKASSERT((object->flags & OBJ_DEAD) == 0);
	vm_object_set_flag(object, OBJ_DEAD);

	/*
	 * Wait for the pageout daemon to be done with the object
	 */
	vm_object_pip_wait(object, "objtrm1");

	KASSERT(!object->paging_in_progress,
		("vm_object_terminate: pageout in progress"));

	/*
	 * Clean and free the pages, as appropriate. All references to the
	 * object are gone, so we don't need to lock it.
	 */
	if (object->type == OBJT_VNODE) {
		struct vnode *vp;

		/*
		 * Clean pages and flush buffers.
		 *
		 * NOTE!  TMPFS buffer flushes do not typically flush the
		 *	  actual page to swap as this would be highly
		 *	  inefficient, and normal filesystems usually wrap
		 *	  page flushes with buffer cache buffers.
		 *
		 *	  To deal with this we have to call vinvalbuf() both
		 *	  before and after the vm_object_page_clean().
		 */
		vp = (struct vnode *) object->handle;
		vinvalbuf(vp, V_SAVE, 0, 0);
		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
		vinvalbuf(vp, V_SAVE, 0, 0);
	}

	/*
	 * Wait for any I/O to complete, after which there had better not
	 * be any references left on the object.
	 */
	vm_object_pip_wait(object, "objtrm2");

	if (object->ref_count != 0) {
		panic("vm_object_terminate: object with references, "
		      "ref_count=%d", object->ref_count);
	}

	/*
	 * Cleanup any shared pmaps associated with this object.
	 */
	pmap_object_free(object);

	/*
	 * Now free any remaining pages.  For internal objects, this also
	 * removes them from paging queues.  Don't free wired pages, just
	 * remove them from the object.
	 */
	info.count = 0;
	info.object = object;
	do {
		info.error = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					vm_object_terminate_callback, &info);
	} while (info.error);

	/*
	 * Let the pager know object is dead.
	 */
	vm_pager_deallocate(object);

	/*
	 * Wait for the object hold count to hit 1, clean out pages as
	 * we go.  vmobj_token interlocks any race conditions that might
	 * pick the object up from the vm_object_list after we have cleared
	 * rb_memq.
	 */
	for (;;) {
		if (RB_ROOT(&object->rb_memq) == NULL)
			break;
		kprintf("vm_object_terminate: Warning, object %p "
			"still has %ld pages\n",
			object, object->resident_page_count);
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					vm_object_terminate_callback, &info);
	}

	/*
	 * There had better not be any pages left
	 */
	KKASSERT(object->resident_page_count == 0);

	/*
	 * Remove the object from the global object list.
	 */
	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	TAILQ_REMOVE(&hash->list, object, object_entry);
	lwkt_reltoken(&hash->token);

	if (object->ref_count != 0) {
		panic("vm_object_terminate2: object with references, "
		      "ref_count=%d", object->ref_count);
	}

	/*
	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
	 *	 the object here.  See vm_object_drop().
	 */
}

/*
 * The caller must hold the object.
 */
static int
vm_object_terminate_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;
	vm_object_t object;

	object = p->object;
	KKASSERT(object == info->object);
	if (vm_page_busy_try(p, TRUE)) {
		vm_page_sleep_busy(p, TRUE, "vmotrm");
		info->error = 1;
		return 0;
	}
	if (object != p->object) {
		/* XXX remove once we determine it can't happen */
		kprintf("vm_object_terminate: Warning: Encountered "
			"busied page %p on queue %d\n", p, p->queue);
		vm_page_wakeup(p);
		info->error = 1;
	} else if (p->wire_count == 0) {
		/*
		 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
		 */
		vm_page_free(p);
		mycpu->gd_cnt.v_pfree++;
	} else {
		if (p->queue != PQ_NONE) {
			kprintf("vm_object_terminate: Warning: Encountered "
				"wired page %p on queue %d\n", p, p->queue);
			if (vm_object_debug > 0) {
				--vm_object_debug;
				print_backtrace(10);
			}
		}
		vm_page_remove(p);
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * Clean all dirty pages in the specified range of object.  Leaves the page
 * on whatever queue it is currently on.   If NOSYNC is set then do not
 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
 * leaving the object dirty.
 *
 * When stuffing pages asynchronously, allow clustering.  XXX we need a
 * synchronous clustering mode implementation.
 *
 * Odd semantics: if start == end, we clean everything.
 *
 * The object must be locked? XXX
 */
static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
static int vm_object_page_clean_pass2(struct vm_page *p, void *data);

void
vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
		     int flags)
{
	struct rb_vm_page_scan_info info;
	struct vnode *vp;
	int wholescan;
	int pagerflags;
	int generation;

	vm_object_hold(object);
	if (object->type != OBJT_VNODE ||
	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
		vm_object_drop(object);
		return;
	}

	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
			VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
	pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;

	vp = object->handle;

	/*
	 * Interlock other major object operations.  This allows us to
	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
	 */
	vm_object_set_flag(object, OBJ_CLEANING);

	/*
	 * Handle 'entire object' case
	 */
	info.start_pindex = start;
	if (end == 0) {
		info.end_pindex = object->size - 1;
	} else {
		info.end_pindex = end - 1;
	}
	wholescan = (start == 0 && info.end_pindex == object->size - 1);
	info.limit = flags;
	info.pagerflags = pagerflags;
	info.object = object;

	/*
	 * If cleaning the entire object do a pass to mark the pages read-only.
	 * If everything worked out ok, clear OBJ_WRITEABLE and
	 * OBJ_MIGHTBEDIRTY.
	 */
	if (wholescan) {
		info.error = 0;
		info.count = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_clean_pass1, &info);
		if (info.error == 0) {
			vm_object_clear_flag(object,
					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
			if (object->type == OBJT_VNODE &&
			    (vp = (struct vnode *)object->handle) != NULL) {
				/*
				 * Use new-style interface to clear VISDIRTY
				 * because the vnode is not necessarily removed
				 * from the syncer list(s) as often as it was
				 * under the old interface, which can leave
				 * the vnode on the syncer list after reclaim.
				 */
				vclrobjdirty(vp);
			}
		}
	}
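
	/*
	 * Editor's note (descriptive, derived from the loop below): the
	 * cleaning pass is restarted whenever a callback had to sleep on a
	 * busy page (info.error) or the object generation changed, since
	 * either event means the resident page set may have changed while
	 * the scan was in progress.
	 */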
	/*
	 * Do a pass to clean all the dirty pages we find.
	 */
	do {
		info.error = 0;
		info.count = 0;
		generation = object->generation;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_clean_pass2, &info);
	} while (info.error || generation != object->generation);

	vm_object_clear_flag(object, OBJ_CLEANING);
	vm_object_drop(object);
}

/*
 * The caller must hold the object.
 */
static
int
vm_object_page_clean_pass1(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	KKASSERT(p->object == info->object);

	vm_page_flag_set(p, PG_CLEANCHK);
	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
		info->error = 1;
	} else if (vm_page_busy_try(p, FALSE)) {
		info->error = 1;
	} else {
		KKASSERT(p->object == info->object);
		vm_page_protect(p, VM_PROT_READ);
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * The caller must hold the object
 */
static
int
vm_object_page_clean_pass2(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;
	int generation;

	KKASSERT(p->object == info->object);

	/*
	 * Do not mess with pages that were inserted after we started
	 * the cleaning pass.
	 */
	if ((p->flags & PG_CLEANCHK) == 0)
		goto done;

	generation = info->object->generation;

	if (vm_page_busy_try(p, TRUE)) {
		vm_page_sleep_busy(p, TRUE, "vpcwai");
		info->error = 1;
		goto done;
	}

	KKASSERT(p->object == info->object &&
		 info->object->generation == generation);

	/*
	 * Before wasting time traversing the pmaps, check for trivial
	 * cases where the page cannot be dirty.
	 */
	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
		KKASSERT((p->dirty & p->valid) == 0 &&
			 (p->flags & PG_NEED_COMMIT) == 0);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Check whether the page is dirty or not.  The page has been set
	 * to be read-only so the check will not race a user dirtying the
	 * page.
	 */
	vm_page_test_dirty(p);
	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
		vm_page_flag_clear(p, PG_CLEANCHK);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * If we have been asked to skip nosync pages and this is a
	 * nosync page, skip it.  Note that the object flags were
	 * not cleared in this case (because pass1 will have returned an
	 * error), so we do not have to set them.
	 */
	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
		vm_page_flag_clear(p, PG_CLEANCHK);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
	 * the pages that get successfully flushed.  Set info->error if
	 * we raced an object modification.
	 */
	vm_object_page_collect_flush(info->object, p, info->pagerflags);
	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
done:
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}
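
/*
 * Editor's note on the PG_CLEANCHK protocol used by the cleaning code above
 * and the flush helper below (descriptive, derived from the code): pass1
 * sets PG_CLEANCHK on the pages present at the start of the scan and
 * write-protects them, pass2 clears the flag as pages are found clean or
 * are handed to the pager, and the flush helper stops clustering at
 * neighboring pages that do not carry the flag (unless
 * VM_PAGER_IGNORE_CLEANCHK is specified).
 */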

/*
 * Collect the specified page and nearby pages and flush them out.
 * The passed page is busied by the caller and we are responsible
 * for its disposition.
 *
 * The caller must hold the object.
 */
static void
vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
{
	int error;
	int is;
	int ib;
	int i;
	int page_base;
	vm_pindex_t pi;
	vm_page_t ma[BLIST_MAX_ALLOC];

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));

	pi = p->pindex;
	page_base = pi % BLIST_MAX_ALLOC;
	ma[page_base] = p;
	ib = page_base - 1;
	is = page_base + 1;

	while (ib >= 0) {
		vm_page_t tp;

		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
					     TRUE, &error);
		if (error)
			break;
		if (tp == NULL)
			break;
		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
		    (tp->flags & PG_CLEANCHK) == 0) {
			vm_page_wakeup(tp);
			break;
		}
		if ((tp->queue - tp->pc) == PQ_CACHE) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		vm_page_test_dirty(tp);
		if ((tp->dirty & tp->valid) == 0 &&
		    (tp->flags & PG_NEED_COMMIT) == 0) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		ma[ib] = tp;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pi - page_base + is < object->size) {
		vm_page_t tp;

		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
					     TRUE, &error);
		if (error)
			break;
		if (tp == NULL)
			break;
		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
		    (tp->flags & PG_CLEANCHK) == 0) {
			vm_page_wakeup(tp);
			break;
		}
		if ((tp->queue - tp->pc) == PQ_CACHE) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		vm_page_test_dirty(tp);
		if ((tp->dirty & tp->valid) == 0 &&
		    (tp->flags & PG_NEED_COMMIT) == 0) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		ma[is] = tp;
		++is;
	}

	/*
	 * All pages in the ma[] array are busied now
	 */
	for (i = ib; i < is; ++i) {
		vm_page_flag_clear(ma[i], PG_CLEANCHK);
		vm_page_hold(ma[i]);	/* XXX need this any more? */
	}
	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
	for (i = ib; i < is; ++i)	/* XXX need this any more? */
		vm_page_unhold(ma[i]);
}

/*
 * Implements the madvise function at the object/page level.
 *
 *	MADV_WILLNEED	(any object)
 *
 *	    Activate the specified pages if they are resident.
 *
 *	MADV_DONTNEED	(any object)
 *
 *	    Deactivate the specified pages if they are resident.
 *
 *	MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
 *
 *	    Deactivate and clean the specified pages if they are
 *	    resident.  This permits the process to reuse the pages
 *	    without faulting or the kernel to reclaim the pages
 *	    without I/O.
 *
 * No requirements.
 */
void
vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
		  vm_pindex_t count, int advise)
{
	vm_pindex_t end;
	vm_page_t m;
	int error;

	if (object == NULL)
		return;

	end = pindex + count;

	vm_object_hold(object);

	/*
	 * Locate and adjust resident pages.  This only applies to the
	 * primary object in the mapping.
	 */
	for (; pindex < end; pindex += 1) {
relookup:
		/*
		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
		 * and those pages must be OBJ_ONEMAPPING.
		 */
		if (advise == MADV_FREE) {
			if ((object->type != OBJT_DEFAULT &&
			     object->type != OBJT_SWAP) ||
			    (object->flags & OBJ_ONEMAPPING) == 0) {
				continue;
			}
		}

		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);

		if (error) {
			vm_page_sleep_busy(m, TRUE, "madvpo");
			goto relookup;
		}
		if (m == NULL) {
			/*
			 * There may be swap even if there is no backing page
			 */
			if (advise == MADV_FREE && object->type == OBJT_SWAP)
				swap_pager_freespace(object, pindex, 1);
			continue;
		}

		/*
		 * If the page is not in a normal active state, we skip it.
		 * If the page is not managed there are no page queues to
		 * mess with.  Things can break if we mess with pages in
		 * any of the below states.
		 */
		if (m->wire_count ||
		    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
				 PG_NEED_COMMIT)) ||
		    m->valid != VM_PAGE_BITS_ALL
		) {
			vm_page_wakeup(m);
			continue;
		}

		/*
		 * Theoretically once a page is known not to be busy, an
		 * interrupt cannot come along and rip it out from under us.
		 */
		if (advise == MADV_WILLNEED) {
			vm_page_activate(m);
		} else if (advise == MADV_DONTNEED) {
			vm_page_dontneed(m);
		} else if (advise == MADV_FREE) {
			/*
			 * Mark the page clean.  This will allow the page
			 * to be freed up by the system.  However, such pages
			 * are often reused quickly by malloc()/free()
			 * so we do not do anything that would cause
			 * a page fault if we can help it.
			 *
			 * Specifically, we do not try to actually free
			 * the page now nor do we try to put it in the
			 * cache (which would cause a page fault on reuse).
			 *
			 * But we do make the page as freeable as we
			 * can without actually taking the step of unmapping
			 * it.
			 */
			pmap_clear_modify(m);
			m->dirty = 0;
			m->act_count = 0;
			vm_page_dontneed(m);
			if (object->type == OBJT_SWAP)
				swap_pager_freespace(object, pindex, 1);
		}
		vm_page_wakeup(m);
	}
	vm_object_drop(object);
}

/*
 * Removes all physical pages in the specified object range from the
 * object's list of pages.
 *
 * No requirements.
 */
static int vm_object_page_remove_callback(vm_page_t p, void *data);

void
vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
		      boolean_t clean_only)
{
	struct rb_vm_page_scan_info info;
	int all;

	/*
	 * Degenerate cases and assertions
	 */
	vm_object_hold(object);
	if (object == NULL ||
	    (object->resident_page_count == 0 && object->swblock_count == 0)) {
		vm_object_drop(object);
		return;
	}
	KASSERT(object->type != OBJT_PHYS,
		("attempt to remove pages from a physical object"));

	/*
	 * Indicate that paging is occurring on the object
	 */
	vm_object_pip_add(object, 1);

	/*
	 * Figure out the actual removal range and whether we are removing
	 * the entire contents of the object or not.  If removing the entire
	 * contents, be sure to get all pages, even those that might be
	 * beyond the end of the object.
	 */
	info.object = object;
	info.start_pindex = start;
	if (end == 0)
		info.end_pindex = (vm_pindex_t)-1;
	else
		info.end_pindex = end - 1;
	info.limit = clean_only;
	info.count = 0;
	all = (start == 0 && info.end_pindex >= object->size - 1);

	/*
	 * Loop until we are sure we have gotten them all.
	 */
	do {
		info.error = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_remove_callback, &info);
	} while (info.error);

	/*
	 * Remove any related swap if throwing away pages, or for
	 * non-swap objects (the swap is a clean copy in that case).
	 */
	if (object->type != OBJT_SWAP || clean_only == FALSE) {
		if (all)
			swap_pager_freespace_all(object);
		else
			swap_pager_freespace(object, info.start_pindex,
					     info.end_pindex -
						info.start_pindex + 1);
	}

	/*
	 * Cleanup
	 */
	vm_object_pip_wakeup(object);
	vm_object_drop(object);
}

/*
 * The caller must hold the object.
 *
 * NOTE: User yields are allowed when removing more than one page, but not
 *	 allowed if only removing one page (the path for single page removals
 *	 might hold a spinlock).
 */
static int
vm_object_page_remove_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	if (info->object != p->object ||
	    p->pindex < info->start_pindex ||
	    p->pindex > info->end_pindex) {
		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
			info->object, p);
		return(0);
	}
	if (vm_page_busy_try(p, TRUE)) {
		vm_page_sleep_busy(p, TRUE, "vmopar");
		info->error = 1;
		return(0);
	}
	if (info->object != p->object) {
		/* this should never happen */
		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
			info->object, p);
		vm_page_wakeup(p);
		return(0);
	}

	/*
	 * Wired pages cannot be destroyed, but they can be invalidated
	 * and we do so if clean_only (limit) is not set.
	 *
	 * WARNING!  The page may be wired due to being part of a buffer
	 *	     cache buffer, and the buffer might be marked B_CACHE.
	 *	     This is fine as part of a truncation but VFSs must be
	 *	     sure to fix the buffer up when re-extending the file.
	 *
	 * NOTE!     PG_NEED_COMMIT is ignored.
	 */
	if (p->wire_count != 0) {
		vm_page_protect(p, VM_PROT_NONE);
		if (info->limit == 0)
			p->valid = 0;
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * limit is our clean_only flag.  If set and the page is dirty or
	 * requires a commit, do not free it.  If set and the page is being
	 * held by someone, do not free it.
	 */
	if (info->limit && p->valid) {
		vm_page_test_dirty(p);
		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
			vm_page_wakeup(p);
			goto done;
		}
	}

	/*
	 * Destroy the page.  But we have to re-test whether it's dirty after
	 * removing it from its pmaps.
	 */
	vm_page_protect(p, VM_PROT_NONE);
	if (info->limit && p->valid) {
		vm_page_test_dirty(p);
		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
			vm_page_wakeup(p);
			goto done;
		}
	}
	vm_page_free(p);

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
done:
	if ((++info->count & 63) == 0)
		lwkt_user_yield();

	return(0);
}
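
/*
 * Worked example for the index arithmetic in vm_object_coalesce() below
 * (illustrative numbers only, not taken from a specific caller): extending
 * a 4-page anonymous region starting at prev_pindex 0 by another 2 pages
 * yields prev_size = 4 and next_size = 2 after the PAGE_SHIFT conversion,
 * so next_pindex = 4 and the object grows to 6 pages unless it was already
 * larger.
 */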

/*
 * Try to extend prev_object into an adjoining region of virtual
 * memory, return TRUE on success.
 *
 * The caller does not need to hold (prev_object) but must have a stable
 * pointer to it (typically by holding the vm_map locked).
 *
 * This function only works for anonymous memory objects which either
 * have (a) one reference or (b) we are extending the object's size.
 * Otherwise the related VM pages we want to use for the object might
 * be in use by another mapping.
 */
boolean_t
vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
		   vm_size_t prev_size, vm_size_t next_size)
{
	vm_pindex_t next_pindex;

	if (prev_object == NULL)
		return (TRUE);

	vm_object_hold(prev_object);

	if (prev_object->type != OBJT_DEFAULT &&
	    prev_object->type != OBJT_SWAP) {
		vm_object_drop(prev_object);
		return (FALSE);
	}

#if 0
	/* caller now checks this */
	/*
	 * Try to collapse the object first
	 */
	vm_object_collapse(prev_object, NULL);
#endif

#if 0
	/* caller now checks this */
	/*
	 * We can't coalesce if we shadow another object (figuring out the
	 * relationships becomes too complex).
	 */
	if (prev_object->backing_object != NULL) {
		vm_object_chain_release(prev_object);
		vm_object_drop(prev_object);
		return (FALSE);
	}
#endif

	prev_size >>= PAGE_SHIFT;
	next_size >>= PAGE_SHIFT;
	next_pindex = prev_pindex + prev_size;

	/*
	 * We can't coalesce if the object has more than one ref count
	 * unless we are extending it into newly minted space.
	 */
	if (prev_object->ref_count > 1 &&
	    prev_object->size != next_pindex) {
		vm_object_drop(prev_object);
		return (FALSE);
	}

	/*
	 * Remove any pages that may still be in the object from a previous
	 * deallocation.
	 */
	if (next_pindex < prev_object->size) {
		vm_object_page_remove(prev_object,
				      next_pindex,
				      next_pindex + next_size, FALSE);
		if (prev_object->type == OBJT_SWAP)
			swap_pager_freespace(prev_object,
					     next_pindex, next_size);
	}

	/*
	 * Extend the object if necessary.
	 */
	if (next_pindex + next_size > prev_object->size)
		prev_object->size = next_pindex + next_size;
	vm_object_drop(prev_object);

	return (TRUE);
}

/*
 * Make the object writable and flag it as being possibly dirty.
 *
 * The object might not be held (or might be held but held shared),
 * the related vnode is probably not held either.  Object and vnode are
 * stable by virtue of the vm_page busied by the caller preventing
 * destruction.
 *
 * If the related mount is flagged MNTK_THR_SYNC we need to call
 * vsetobjdirty().  Filesystems using this option usually shortcut
 * synchronization by only scanning the syncer list.
 */
void
vm_object_set_writeable_dirty(vm_object_t object)
{
	struct vnode *vp;

	/*vm_object_assert_held(object);*/
	/*
	 * Avoid contention in vm fault path by checking the state before
	 * issuing an atomic op on it.
	 */
	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
	}
	if (object->type == OBJT_VNODE &&
	    (vp = (struct vnode *)object->handle) != NULL) {
		if ((vp->v_flag & VOBJDIRTY) == 0) {
			if (vp->v_mount &&
			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
				/*
				 * New style THR_SYNC places vnodes on the
				 * syncer list more deterministically.
				 */
				vsetobjdirty(vp);
			} else {
				/*
				 * Old style scan would not necessarily place
				 * a vnode on the syncer list when possibly
				 * modified via mmap.
				 */
				vsetflags(vp, VOBJDIRTY);
			}
		}
	}
}

#include "opt_ddb.h"
#ifdef DDB
#include <sys/cons.h>

#include <ddb/ddb.h>

static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
				   vm_map_entry_t entry);
static int	vm_object_in_map (vm_object_t object);

/*
 * The caller must hold the object.
 */
static int
_vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
{
	vm_map_backing_t ba;
	vm_map_t tmpm;
	vm_map_entry_t tmpe;
	int entcount;

	if (map == NULL)
		return 0;
	if (entry == NULL) {
		tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
		entcount = map->nentries;
		while (entcount-- && tmpe) {
			if (_vm_object_in_map(map, object, tmpe)) {
				return 1;
			}
			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
		}
		return (0);
	}
	switch(entry->maptype) {
	case VM_MAPTYPE_SUBMAP:
		tmpm = entry->ba.sub_map;
		tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
		entcount = tmpm->nentries;
		while (entcount-- && tmpe) {
			if (_vm_object_in_map(tmpm, object, tmpe)) {
				return 1;
			}
			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
		}
		break;
	case VM_MAPTYPE_NORMAL:
	case VM_MAPTYPE_VPAGETABLE:
		ba = &entry->ba;
		while (ba) {
			if (ba->object == object)
				return TRUE;
			ba = ba->backing_ba;
		}
		break;
	default:
		break;
	}
	return 0;
}

static int vm_object_in_map_callback(struct proc *p, void *data);

struct vm_object_in_map_info {
	vm_object_t	object;
	int		rv;
};

/*
 * Debugging only
 */
static int
vm_object_in_map(vm_object_t object)
{
	struct vm_object_in_map_info info;

	info.rv = 0;
	info.object = object;

	allproc_scan(vm_object_in_map_callback, &info, 0);
	if (info.rv)
		return 1;
	if (_vm_object_in_map(&kernel_map, object, 0))
		return 1;
	if (_vm_object_in_map(&pager_map, object, 0))
		return 1;
	if (_vm_object_in_map(&buffer_map, object, 0))
		return 1;
	return 0;
}

/*
 * Debugging only
 */
static int
vm_object_in_map_callback(struct proc *p, void *data)
{
	struct vm_object_in_map_info *info = data;

	if (p->p_vmspace) {
		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
			info->rv = 1;
			return -1;
		}
	}
	return (0);
}

DB_SHOW_COMMAND(vmochk, vm_object_check)
{
	struct vm_object_hash *hash;
	vm_object_t object;
	int n;

	/*
	 * make sure that internal objs are in a map somewhere
	 * and none have zero ref counts.
	 */
	for (n = 0; n < VMOBJ_HSIZE; ++n) {
		hash = &vm_object_hash[n];
		for (object = TAILQ_FIRST(&hash->list);
		     object != NULL;
		     object = TAILQ_NEXT(object, object_entry)) {
			if (object->type == OBJT_MARKER)
				continue;
			if (object->handle != NULL ||
			    (object->type != OBJT_DEFAULT &&
			     object->type != OBJT_SWAP)) {
				continue;
			}
			if (object->ref_count == 0) {
				db_printf("vmochk: internal obj has "
					  "zero ref count: %ld\n",
					  (long)object->size);
			}
			if (vm_object_in_map(object))
				continue;
			db_printf("vmochk: internal obj is not in a map: "
				  "ref: %d, size: %lu: 0x%lx\n",
				  object->ref_count, (u_long)object->size,
				  (u_long)object->size);
		}
	}
}

/*
 * Debugging only
 */
DB_SHOW_COMMAND(object, vm_object_print_static)
{
	/* XXX convert args. */
	vm_object_t object = (vm_object_t)addr;
	boolean_t full = have_addr;

	vm_page_t p;

	/* XXX count is an (unused) arg.  Avoid shadowing it. */
#define	count	was_count

	int count;

	if (object == NULL)
		return;

	db_iprintf(
	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
	    object, (int)object->type, (u_long)object->size,
	    object->resident_page_count, object->ref_count, object->flags);
	/*
	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
	 */
	db_iprintf("\n");

	if (!full)
		return;

	db_indent += 2;
	count = 0;
	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
		if (count == 0)
			db_iprintf("memory:=");
		else if (count == 6) {
			db_printf("\n");
			db_iprintf(" ...");
			count = 0;
		} else
			db_printf(",");
		count++;

		db_printf("(off=0x%lx,page=0x%lx)",
			  (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
	}
	if (count != 0)
		db_printf("\n");
	db_indent -= 2;
}

/* XXX. */
#undef count

/*
 * XXX need this non-static entry for calling from vm_map_print.
 *
 * Debugging only
 */
void
vm_object_print(/* db_expr_t */ long addr,
		boolean_t have_addr,
		/* db_expr_t */ long count,
		char *modif)
{
	vm_object_print_static(addr, have_addr, count, modif);
}

/*
 * Debugging only
 */
DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
{
	struct vm_object_hash *hash;
	vm_object_t object;
	int nl = 0;
	int c;
	int n;

	for (n = 0; n < VMOBJ_HSIZE; ++n) {
		hash = &vm_object_hash[n];
		for (object = TAILQ_FIRST(&hash->list);
		     object != NULL;
		     object = TAILQ_NEXT(object, object_entry)) {
			vm_pindex_t idx, fidx;
			vm_pindex_t osize;
			vm_paddr_t pa = -1, padiff;
			int rcount;
			vm_page_t m;

			if (object->type == OBJT_MARKER)
				continue;
			db_printf("new object: %p\n", (void *)object);
			if (nl > 18) {
				c = cngetc();
				if (c != ' ')
					return;
				nl = 0;
			}
			nl++;
			rcount = 0;
			fidx = 0;
			osize = object->size;
			if (osize > 128)
				osize = 128;
			for (idx = 0; idx < osize; idx++) {
				m = vm_page_lookup(object, idx);
				if (m == NULL) {
					if (rcount) {
						db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
							(long)fidx, rcount, (long)pa);
						if (nl > 18) {
							c = cngetc();
							if (c != ' ')
								return;
							nl = 0;
						}
						nl++;
						rcount = 0;
					}
					continue;
				}

				if (rcount &&
				    (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
					++rcount;
					continue;
				}
				if (rcount) {
					padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
					padiff >>= PAGE_SHIFT;
					padiff &= PQ_L2_MASK;
					if (padiff == 0) {
						pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
						++rcount;
						continue;
					}
					db_printf(" index(%ld)run(%d)pa(0x%lx)",
						(long)fidx, rcount, (long)pa);
					db_printf("pd(%ld)\n", (long)padiff);
					if (nl > 18) {
						c = cngetc();
						if (c != ' ')
							return;
						nl = 0;
					}
					nl++;
				}
				fidx = idx;
				pa = VM_PAGE_TO_PHYS(m);
				rcount = 1;
			}
			if (rcount) {
				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
					(long)fidx, rcount, (long)pa);
				if (nl > 18) {
					c = cngetc();
					if (c != ' ')
						return;
					nl = 0;
				}
				nl++;
			}
		}
	}
}
#endif /* DDB */