/*
 * Copyright (c) 1991, 1993, 2013
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
 */

/*
 * Virtual memory object module.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>		/* for curproc, pageproc */
#include <sys/thread.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/refcount.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <vm/vm_page2.h>

#include <machine/specialreg.h>

#define EASY_SCAN_FACTOR	8

static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
					     int pagerflags);
static void	vm_object_lock_init(vm_object_t);

/*
 *	Virtual memory objects maintain the actual data
 *	associated with allocated virtual memory.  A given
 *	page of memory exists within exactly one object.
 *
 *	An object is only deallocated when all "references"
 *	are given up.  Only one "reference" to a given
 *	region of an object should be writeable.
 *
 *	Associated with each object is a list of all resident
 *	memory pages belonging to that object; this list is
 *	maintained by the "vm_page" module, and locked by the object's
 *	lock.
 *
 *	Each object also records a "pager" routine which is
 *	used to retrieve (and store) pages to the proper backing
 *	storage.  In addition, objects may be backed by other
 *	objects from which they were virtual-copied.
 *
 *	The only items within the object structure which are
 *	modified after time of creation are:
 *		reference count		locked by object's lock
 *		pager routine		locked by object's lock
 *
 */

struct vm_object kernel_object;

struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];

MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");

#define VMOBJ_HASH_PRIME1	66555444443333333ULL
#define VMOBJ_HASH_PRIME2	989042931893ULL

int vm_object_debug;
SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");

static __inline
struct vm_object_hash *
vmobj_hash(vm_object_t obj)
{
	uintptr_t hash1;
	uintptr_t hash2;

	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
	hash1 %= VMOBJ_HASH_PRIME1;
	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
	hash2 %= VMOBJ_HASH_PRIME2;
	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
}
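
/*
 * Illustrative sketch (not part of the original code): how callers are
 * expected to use vmobj_hash().  The bucket's token serializes access to
 * that bucket's object list; _vm_object_allocate() and
 * vm_object_terminate() below follow this same pattern.  The function
 * name is hypothetical and the block is compiled out.
 */
#if 0
static void
example_vmobj_hash_usage(vm_object_t obj)
{
	struct vm_object_hash *hash;

	hash = vmobj_hash(obj);		/* pick bucket by pointer hash */
	lwkt_gettoken(&hash->token);	/* lock the bucket */
	/* ... TAILQ_INSERT_TAIL() or TAILQ_REMOVE() on hash->list ... */
	lwkt_reltoken(&hash->token);	/* unlock the bucket */
}
#endif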

#if defined(DEBUG_LOCKS)

#define vm_object_vndeallocate(obj, vpp)	\
		debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)

/*
 * Debug helper to track hold/drop/ref/deallocate calls.
 */
static void
debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
{
	int i;

	i = atomic_fetchadd_int(&obj->debug_index, 1);
	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
	ksnprintf(obj->debug_hold_thrs[i],
		  sizeof(obj->debug_hold_thrs[i]),
		  "%c%d:(%d):%s",
		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
		  (curthread->td_proc ? curthread->td_proc->p_pid : -1),
		  obj->ref_count,
		  curthread->td_comm);
	obj->debug_hold_file[i] = file;
	obj->debug_hold_line[i] = line;
#if 0
	/* Uncomment for debugging obj refs/derefs in reproducible cases */
	if (strcmp(curthread->td_comm, "sshd") == 0) {
		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
			obj, obj->ref_count, addrem, file, line);
	}
#endif
}

#endif

/*
 * Misc low level routines
 */
static void
vm_object_lock_init(vm_object_t obj)
{
#if defined(DEBUG_LOCKS)
	int i;

	obj->debug_index = 0;
	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
		obj->debug_hold_thrs[i][0] = 0;
		obj->debug_hold_file[i] = NULL;
		obj->debug_hold_line[i] = 0;
	}
#endif
}

void
vm_object_lock_swap(void)
{
	lwkt_token_swap();
}

void
vm_object_lock(vm_object_t obj)
{
	lwkt_gettoken(&obj->token);
}

/*
 * Returns TRUE on success
 */
static int
vm_object_lock_try(vm_object_t obj)
{
	return(lwkt_trytoken(&obj->token));
}

void
vm_object_lock_shared(vm_object_t obj)
{
	lwkt_gettoken_shared(&obj->token);
}

void
vm_object_unlock(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
}

void
vm_object_upgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken(&obj->token);
}

void
vm_object_downgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken_shared(&obj->token);
}

static __inline void
vm_object_assert_held(vm_object_t obj)
{
	ASSERT_LWKT_TOKEN_HELD(&obj->token);
}
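
/*
 * Illustrative sketch (not part of the original code): vm_object_upgrade()
 * releases the token and reacquires it exclusively, so it can block and
 * the object state may change across the call.  Callers revalidate after
 * upgrading, as vm_object_vndeallocate() does with its atomic_cmpset loop.
 * The function name is hypothetical and the block is compiled out.
 */
#if 0
static void
example_shared_to_exclusive(vm_object_t obj)
{
	vm_object_lock_shared(obj);
	if (obj->ref_count == 1) {
		vm_object_upgrade(obj);	/* may block, state may change */
		if (obj->ref_count == 1) {
			/* ... safe to do exclusive-only work here ... */
		}
	}
	vm_object_unlock(obj);
}
#endif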

int
vm_quickcolor(void)
{
	globaldata_t gd = mycpu;
	int pg_color;

	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
	pg_color += gd->gd_quick_color;
	gd->gd_quick_color += PQ_PRIME2;

	return pg_color;
}

void
VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}

int
VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	if (vm_object_lock_try(obj) == 0) {
		if (refcount_release(&obj->hold_count)) {
			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
				kfree(obj, M_VM_OBJECT);
		}
		return(0);
	}

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
	return(1);
}

void
VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock_shared(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}

/*
 * Drop the token and hold_count on the object.
 *
 * WARNING! Token might be shared.
 */
void
VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
{
	if (obj == NULL)
		return;

	/*
	 * No new holders should be possible once we drop hold_count 1->0 as
	 * there is no longer any way to reference the object.
	 */
	KKASSERT(obj->hold_count > 0);
	if (refcount_release(&obj->hold_count)) {
#if defined(DEBUG_LOCKS)
		debugvm_object_add(obj, file, line, -1);
#endif

		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
			vm_object_unlock(obj);
			kfree(obj, M_VM_OBJECT);
		} else {
			vm_object_unlock(obj);
		}
	} else {
#if defined(DEBUG_LOCKS)
		debugvm_object_add(obj, file, line, -1);
#endif
		vm_object_unlock(obj);
	}
}
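
/*
 * Illustrative sketch (not part of the original code): the typical pattern
 * for working on an object is hold, operate while the token is held, drop.
 * vm_object_hold() bumps hold_count and acquires the token; the final
 * vm_object_drop() may free an OBJ_DEAD object.  The function name is
 * hypothetical and the block is compiled out.
 */
#if 0
static void
example_hold_drop(vm_object_t obj)
{
	vm_object_hold(obj);		/* hold_count + exclusive token */
	/* ... examine or modify the object ... */
	vm_object_drop(obj);		/* release token, drop hold */
}
#endif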

/*
 * Initialize a freshly allocated object, returning a held object.
 *
 * Used only by vm_object_allocate(), zinitna() and vm_object_init().
 *
 * No requirements.
 */
void
_vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
{
	struct vm_object_hash *hash;

	RB_INIT(&object->rb_memq);
	lwkt_token_init(&object->token, "vmobj");

	TAILQ_INIT(&object->backing_list);
	object->type = type;
	object->size = size;
	object->ref_count = 1;
	object->memattr = VM_MEMATTR_DEFAULT;
	object->hold_count = 0;
	object->flags = 0;
	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
		vm_object_set_flag(object, OBJ_ONEMAPPING);
	object->paging_in_progress = 0;
	object->resident_page_count = 0;
	/* cpu localization twist */
	object->pg_color = vm_quickcolor();
	object->handle = NULL;

	atomic_add_int(&object->generation, 1);
	object->swblock_count = 0;
	RB_INIT(&object->swblock_root);
	vm_object_lock_init(object);
	pmap_object_init(object);

	vm_object_hold(object);

	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
	lwkt_reltoken(&hash->token);
}

/*
 * Initialize a VM object.
 */
void
vm_object_init(vm_object_t object, vm_pindex_t size)
{
	_vm_object_allocate(OBJT_DEFAULT, size, object);
	vm_object_drop(object);
}

/*
 * Initialize the VM objects module.
 *
 * Called from the low level boot code only.  Note that this occurs before
 * kmalloc is initialized so we cannot allocate any VM objects.
 */
void
vm_object_init1(void)
{
	int i;

	for (i = 0; i < VMOBJ_HSIZE; ++i) {
		TAILQ_INIT(&vm_object_hash[i].list);
		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
	}

	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
			    &kernel_object);
	vm_object_drop(&kernel_object);
}

void
vm_object_init2(void)
{
	kmalloc_set_unlimited(M_VM_OBJECT);
}

/*
 * Allocate and return a new object of the specified type and size.
 *
 * No requirements.
 */
vm_object_t
vm_object_allocate(objtype_t type, vm_pindex_t size)
{
	vm_object_t obj;

	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
	_vm_object_allocate(type, size, obj);
	vm_object_drop(obj);

	return (obj);
}

/*
 * This version returns a held object, allowing further atomic initialization
 * of the object.
 */
vm_object_t
vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
{
	vm_object_t obj;

	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
	_vm_object_allocate(type, size, obj);

	return (obj);
}
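
/*
 * Illustrative sketch (not part of the original code): when additional
 * fields must be initialized before other threads can find the object,
 * allocate it held and drop it only after setup completes.  The function
 * name and the OBJT_DEFAULT argument are just example choices; the block
 * is compiled out.
 */
#if 0
static vm_object_t
example_allocate_and_setup(vm_pindex_t size)
{
	vm_object_t obj;

	obj = vm_object_allocate_hold(OBJT_DEFAULT, size);
	/* ... further atomic initialization while the object is held ... */
	vm_object_drop(obj);

	return (obj);
}
#endif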

/*
 * Add an additional reference to a vm_object.  The object must already be
 * held.  The original non-lock version is no longer supported.  The object
 * must NOT be chain locked by anyone at the time the reference is added.
 *
 * The object must be held, but may be held shared if desired (hence why
 * we use an atomic op).
 */
void
VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object != NULL);
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	atomic_add_int(&object->ref_count, 1);
	if (object->type == OBJT_VNODE) {
		vref(object->handle);
		/* XXX what if the vnode is being destroyed? */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}

/*
 * This version is only allowed in situations where the caller
 * already knows that the object is deterministically referenced
 * (usually because it's taken from a ref'd vnode, or during a map_entry
 * replication).
 */
void
VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
	atomic_add_int(&object->ref_count, 1);
	if (object->type == OBJT_VNODE)
		vref(object->handle);
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}

/*
 * Dereference an object and its underlying vnode.  The object may be
 * held shared.  On return the object will remain held.
 *
 * This function may return a vnode in *vpp which the caller must release
 * after the caller drops its own lock.  If vpp is NULL, we assume that
 * the caller was holding an exclusive lock on the object and we vrele()
 * the vp ourselves.
 */
static void
VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
				   VMOBJDBARGS)
{
	struct vnode *vp = (struct vnode *) object->handle;

	KASSERT(object->type == OBJT_VNODE,
		("vm_object_vndeallocate: not a vnode object"));
	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
#ifdef INVARIANTS
	if (object->ref_count == 0) {
		vprint("vm_object_vndeallocate", vp);
		panic("vm_object_vndeallocate: bad object reference count");
	}
#endif
	for (;;) {
		int count = object->ref_count;
		cpu_ccfence();
		if (count == 1) {
			vm_object_upgrade(object);
			if (atomic_cmpset_int(&object->ref_count, count, 0)) {
				vclrflags(vp, VTEXT);
				break;
			}
		} else {
			if (atomic_cmpset_int(&object->ref_count,
					      count, count - 1)) {
				break;
			}
		}
		/* retry */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, -1);
#endif

	/*
	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
	 * if the object was locked exclusively.  But there are two races
	 * here.
	 *
	 * We had to upgrade the object above to safely clear VTEXT
	 * but the alternative path where the shared lock is retained
	 * can STILL race to 0 in other paths and cause our own vrele()
	 * to terminate the vnode.  We can't allow that if the VM object
	 * is still locked shared.
	 */
	if (vpp)
		*vpp = vp;
	else
		vrele(vp);
}

/*
 * Release a reference to the specified object, gained either through a
 * vm_object_allocate or a vm_object_reference call.  When all references
 * are gone, storage associated with this object may be relinquished.
 *
 * The caller does not have to hold the object locked but must have control
 * over the reference in question in order to guarantee that the object
 * does not get ripped out from under us.
 *
 * XXX Currently all deallocations require an exclusive lock.
 */
void
VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
{
	struct vnode *vp;
	int count;

	if (object == NULL)
		return;

	for (;;) {
		count = object->ref_count;
		cpu_ccfence();

		/*
		 * If decrementing the count enters into special handling
		 * territory (0, 1, or 2) we have to do it the hard way.
		 * Fortunately, objects with only a few refs like this
		 * are not likely to be heavily contended anyway.
		 *
		 * For vnode objects we only care about 1->0 transitions.
		 */
		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
#if defined(DEBUG_LOCKS)
			debugvm_object_add(object, file, line, 0);
#endif
			vm_object_hold(object);
			vm_object_deallocate_locked(object);
			vm_object_drop(object);
			break;
		}

		/*
		 * Try to decrement ref_count without acquiring a hold on
		 * the object.  This is particularly important for the exec*()
		 * and exit*() code paths because the program binary may
		 * have a great deal of sharing and an exclusive lock will
		 * crowbar performance in those circumstances.
638 */ 639 if (object->type == OBJT_VNODE) { 640 vp = (struct vnode *)object->handle; 641 if (atomic_cmpset_int(&object->ref_count, 642 count, count - 1)) { 643 #if defined(DEBUG_LOCKS) 644 debugvm_object_add(object, file, line, -1); 645 #endif 646 647 vrele(vp); 648 break; 649 } 650 /* retry */ 651 } else { 652 if (atomic_cmpset_int(&object->ref_count, 653 count, count - 1)) { 654 #if defined(DEBUG_LOCKS) 655 debugvm_object_add(object, file, line, -1); 656 #endif 657 break; 658 } 659 /* retry */ 660 } 661 /* retry */ 662 } 663 } 664 665 void 666 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS) 667 { 668 /* 669 * Degenerate case 670 */ 671 if (object == NULL) 672 return; 673 674 /* 675 * vnode case, caller either locked the object exclusively 676 * or this is a recursion with must_drop != 0 and the vnode 677 * object will be locked shared. 678 * 679 * If locked shared we have to drop the object before we can 680 * call vrele() or risk a shared/exclusive livelock. 681 */ 682 if (object->type == OBJT_VNODE) { 683 ASSERT_LWKT_TOKEN_HELD(&object->token); 684 vm_object_vndeallocate(object, NULL); 685 return; 686 } 687 ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token); 688 689 /* 690 * Normal case (object is locked exclusively) 691 */ 692 if (object->ref_count == 0) { 693 panic("vm_object_deallocate: object deallocated " 694 "too many times: %d", object->type); 695 } 696 if (object->ref_count > 2) { 697 atomic_add_int(&object->ref_count, -1); 698 #if defined(DEBUG_LOCKS) 699 debugvm_object_add(object, file, line, -1); 700 #endif 701 return; 702 } 703 704 /* 705 * Drop the ref and handle termination on the 1->0 transition. 706 * We may have blocked above so we have to recheck. 707 */ 708 KKASSERT(object->ref_count != 0); 709 if (object->ref_count >= 2) { 710 atomic_add_int(&object->ref_count, -1); 711 #if defined(DEBUG_LOCKS) 712 debugvm_object_add(object, file, line, -1); 713 #endif 714 return; 715 } 716 717 atomic_add_int(&object->ref_count, -1); 718 if ((object->flags & OBJ_DEAD) == 0) 719 vm_object_terminate(object); 720 } 721 722 /* 723 * Destroy the specified object, freeing up related resources. 724 * 725 * The object must have zero references. 726 * 727 * The object must held. The caller is responsible for dropping the object 728 * after terminate returns. Terminate does NOT drop the object. 729 */ 730 static int vm_object_terminate_callback(vm_page_t p, void *data); 731 732 void 733 vm_object_terminate(vm_object_t object) 734 { 735 struct rb_vm_page_scan_info info; 736 struct vm_object_hash *hash; 737 738 /* 739 * Make sure no one uses us. Once we set OBJ_DEAD we should be 740 * able to safely block. 741 */ 742 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 743 KKASSERT((object->flags & OBJ_DEAD) == 0); 744 vm_object_set_flag(object, OBJ_DEAD); 745 746 /* 747 * Wait for the pageout daemon to be done with the object 748 */ 749 vm_object_pip_wait(object, "objtrm1"); 750 751 KASSERT(!object->paging_in_progress, 752 ("vm_object_terminate: pageout in progress")); 753 754 /* 755 * Clean and free the pages, as appropriate. All references to the 756 * object are gone, so we don't need to lock it. 757 */ 758 if (object->type == OBJT_VNODE) { 759 struct vnode *vp; 760 761 /* 762 * Clean pages and flush buffers. 763 * 764 * NOTE! TMPFS buffer flushes do not typically flush the 765 * actual page to swap as this would be highly 766 * inefficient, and normal filesystems usually wrap 767 * page flushes with buffer cache buffers. 
768 * 769 * To deal with this we have to call vinvalbuf() both 770 * before and after the vm_object_page_clean(). 771 */ 772 vp = (struct vnode *) object->handle; 773 vinvalbuf(vp, V_SAVE, 0, 0); 774 vm_object_page_clean(object, 0, 0, OBJPC_SYNC); 775 vinvalbuf(vp, V_SAVE, 0, 0); 776 } 777 778 /* 779 * Wait for any I/O to complete, after which there had better not 780 * be any references left on the object. 781 */ 782 vm_object_pip_wait(object, "objtrm2"); 783 784 if (object->ref_count != 0) { 785 panic("vm_object_terminate: object with references, " 786 "ref_count=%d", object->ref_count); 787 } 788 789 /* 790 * Cleanup any shared pmaps associated with this object. 791 */ 792 pmap_object_free(object); 793 794 /* 795 * Now free any remaining pages. For internal objects, this also 796 * removes them from paging queues. Don't free wired pages, just 797 * remove them from the object. 798 */ 799 info.count = 0; 800 info.object = object; 801 do { 802 info.error = 0; 803 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 804 vm_object_terminate_callback, &info); 805 } while (info.error); 806 807 /* 808 * Let the pager know object is dead. 809 */ 810 vm_pager_deallocate(object); 811 812 /* 813 * Wait for the object hold count to hit 1, clean out pages as 814 * we go. vmobj_token interlocks any race conditions that might 815 * pick the object up from the vm_object_list after we have cleared 816 * rb_memq. 817 */ 818 for (;;) { 819 if (RB_ROOT(&object->rb_memq) == NULL) 820 break; 821 kprintf("vm_object_terminate: Warning, object %p " 822 "still has %ld pages\n", 823 object, object->resident_page_count); 824 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 825 vm_object_terminate_callback, &info); 826 } 827 828 /* 829 * There had better not be any pages left 830 */ 831 KKASSERT(object->resident_page_count == 0); 832 833 /* 834 * Remove the object from the global object list. 835 */ 836 hash = vmobj_hash(object); 837 lwkt_gettoken(&hash->token); 838 TAILQ_REMOVE(&hash->list, object, object_entry); 839 lwkt_reltoken(&hash->token); 840 841 if (object->ref_count != 0) { 842 panic("vm_object_terminate2: object with references, " 843 "ref_count=%d", object->ref_count); 844 } 845 846 /* 847 * NOTE: The object hold_count is at least 1, so we cannot kfree() 848 * the object here. See vm_object_drop(). 849 */ 850 } 851 852 /* 853 * The caller must hold the object. 854 */ 855 static int 856 vm_object_terminate_callback(vm_page_t p, void *data) 857 { 858 struct rb_vm_page_scan_info *info = data; 859 vm_object_t object; 860 861 object = p->object; 862 KKASSERT(object == info->object); 863 if (vm_page_busy_try(p, TRUE)) { 864 vm_page_sleep_busy(p, TRUE, "vmotrm"); 865 info->error = 1; 866 return 0; 867 } 868 if (object != p->object) { 869 /* XXX remove once we determine it can't happen */ 870 kprintf("vm_object_terminate: Warning: Encountered " 871 "busied page %p on queue %d\n", p, p->queue); 872 vm_page_wakeup(p); 873 info->error = 1; 874 } else if (p->wire_count == 0) { 875 /* 876 * NOTE: p->dirty and PG_NEED_COMMIT are ignored. 
877 */ 878 vm_page_free(p); 879 mycpu->gd_cnt.v_pfree++; 880 } else { 881 if (p->queue != PQ_NONE) { 882 kprintf("vm_object_terminate: Warning: Encountered " 883 "wired page %p on queue %d\n", p, p->queue); 884 if (vm_object_debug > 0) { 885 --vm_object_debug; 886 print_backtrace(10); 887 } 888 } 889 vm_page_remove(p); 890 vm_page_wakeup(p); 891 } 892 893 /* 894 * Must be at end to avoid SMP races, caller holds object token 895 */ 896 if ((++info->count & 63) == 0) 897 lwkt_user_yield(); 898 return(0); 899 } 900 901 /* 902 * Clean all dirty pages in the specified range of object. Leaves page 903 * on whatever queue it is currently on. If NOSYNC is set then do not 904 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC), 905 * leaving the object dirty. 906 * 907 * When stuffing pages asynchronously, allow clustering. XXX we need a 908 * synchronous clustering mode implementation. 909 * 910 * Odd semantics: if start == end, we clean everything. 911 * 912 * The object must be locked? XXX 913 */ 914 static int vm_object_page_clean_pass1(struct vm_page *p, void *data); 915 static int vm_object_page_clean_pass2(struct vm_page *p, void *data); 916 917 void 918 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, 919 int flags) 920 { 921 struct rb_vm_page_scan_info info; 922 struct vnode *vp; 923 int wholescan; 924 int pagerflags; 925 int generation; 926 927 vm_object_hold(object); 928 if (object->type != OBJT_VNODE || 929 (object->flags & OBJ_MIGHTBEDIRTY) == 0) { 930 vm_object_drop(object); 931 return; 932 } 933 934 pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? 935 VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; 936 pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0; 937 938 vp = object->handle; 939 940 /* 941 * Interlock other major object operations. This allows us to 942 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY. 943 */ 944 vm_object_set_flag(object, OBJ_CLEANING); 945 946 /* 947 * Handle 'entire object' case 948 */ 949 info.start_pindex = start; 950 if (end == 0) { 951 info.end_pindex = object->size - 1; 952 } else { 953 info.end_pindex = end - 1; 954 } 955 wholescan = (start == 0 && info.end_pindex == object->size - 1); 956 info.limit = flags; 957 info.pagerflags = pagerflags; 958 info.object = object; 959 960 /* 961 * If cleaning the entire object do a pass to mark the pages read-only. 962 * If everything worked out ok, clear OBJ_WRITEABLE and 963 * OBJ_MIGHTBEDIRTY. 964 */ 965 if (wholescan) { 966 info.error = 0; 967 info.count = 0; 968 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 969 vm_object_page_clean_pass1, &info); 970 if (info.error == 0) { 971 vm_object_clear_flag(object, 972 OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); 973 if (object->type == OBJT_VNODE && 974 (vp = (struct vnode *)object->handle) != NULL) { 975 /* 976 * Use new-style interface to clear VISDIRTY 977 * because the vnode is not necessarily removed 978 * from the syncer list(s) as often as it was 979 * under the old interface, which can leave 980 * the vnode on the syncer list after reclaim. 981 */ 982 vclrobjdirty(vp); 983 } 984 } 985 } 986 987 /* 988 * Do a pass to clean all the dirty pages we find. 
989 */ 990 do { 991 info.error = 0; 992 info.count = 0; 993 generation = object->generation; 994 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 995 vm_object_page_clean_pass2, &info); 996 } while (info.error || generation != object->generation); 997 998 vm_object_clear_flag(object, OBJ_CLEANING); 999 vm_object_drop(object); 1000 } 1001 1002 /* 1003 * The caller must hold the object. 1004 */ 1005 static 1006 int 1007 vm_object_page_clean_pass1(struct vm_page *p, void *data) 1008 { 1009 struct rb_vm_page_scan_info *info = data; 1010 1011 KKASSERT(p->object == info->object); 1012 1013 vm_page_flag_set(p, PG_CLEANCHK); 1014 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { 1015 info->error = 1; 1016 } else if (vm_page_busy_try(p, FALSE)) { 1017 info->error = 1; 1018 } else { 1019 KKASSERT(p->object == info->object); 1020 vm_page_protect(p, VM_PROT_READ); 1021 vm_page_wakeup(p); 1022 } 1023 1024 /* 1025 * Must be at end to avoid SMP races, caller holds object token 1026 */ 1027 if ((++info->count & 63) == 0) 1028 lwkt_user_yield(); 1029 return(0); 1030 } 1031 1032 /* 1033 * The caller must hold the object 1034 */ 1035 static 1036 int 1037 vm_object_page_clean_pass2(struct vm_page *p, void *data) 1038 { 1039 struct rb_vm_page_scan_info *info = data; 1040 int generation; 1041 1042 KKASSERT(p->object == info->object); 1043 1044 /* 1045 * Do not mess with pages that were inserted after we started 1046 * the cleaning pass. 1047 */ 1048 if ((p->flags & PG_CLEANCHK) == 0) 1049 goto done; 1050 1051 generation = info->object->generation; 1052 1053 if (vm_page_busy_try(p, TRUE)) { 1054 vm_page_sleep_busy(p, TRUE, "vpcwai"); 1055 info->error = 1; 1056 goto done; 1057 } 1058 1059 KKASSERT(p->object == info->object && 1060 info->object->generation == generation); 1061 1062 /* 1063 * Before wasting time traversing the pmaps, check for trivial 1064 * cases where the page cannot be dirty. 1065 */ 1066 if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) { 1067 KKASSERT((p->dirty & p->valid) == 0 && 1068 (p->flags & PG_NEED_COMMIT) == 0); 1069 vm_page_wakeup(p); 1070 goto done; 1071 } 1072 1073 /* 1074 * Check whether the page is dirty or not. The page has been set 1075 * to be read-only so the check will not race a user dirtying the 1076 * page. 1077 */ 1078 vm_page_test_dirty(p); 1079 if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) { 1080 vm_page_flag_clear(p, PG_CLEANCHK); 1081 vm_page_wakeup(p); 1082 goto done; 1083 } 1084 1085 /* 1086 * If we have been asked to skip nosync pages and this is a 1087 * nosync page, skip it. Note that the object flags were 1088 * not cleared in this case (because pass1 will have returned an 1089 * error), so we do not have to set them. 1090 */ 1091 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { 1092 vm_page_flag_clear(p, PG_CLEANCHK); 1093 vm_page_wakeup(p); 1094 goto done; 1095 } 1096 1097 /* 1098 * Flush as many pages as we can. PG_CLEANCHK will be cleared on 1099 * the pages that get successfully flushed. Set info->error if 1100 * we raced an object modification. 1101 */ 1102 vm_object_page_collect_flush(info->object, p, info->pagerflags); 1103 /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */ 1104 1105 /* 1106 * Must be at end to avoid SMP races, caller holds object token 1107 */ 1108 done: 1109 if ((++info->count & 63) == 0) 1110 lwkt_user_yield(); 1111 return(0); 1112 } 1113 1114 /* 1115 * Collect the specified page and nearby pages and flush them out. 
1116 * The number of pages flushed is returned. The passed page is busied 1117 * by the caller and we are responsible for its disposition. 1118 * 1119 * The caller must hold the object. 1120 */ 1121 static void 1122 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags) 1123 { 1124 int error; 1125 int is; 1126 int ib; 1127 int i; 1128 int page_base; 1129 vm_pindex_t pi; 1130 vm_page_t ma[BLIST_MAX_ALLOC]; 1131 1132 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 1133 1134 pi = p->pindex; 1135 page_base = pi % BLIST_MAX_ALLOC; 1136 ma[page_base] = p; 1137 ib = page_base - 1; 1138 is = page_base + 1; 1139 1140 while (ib >= 0) { 1141 vm_page_t tp; 1142 1143 tp = vm_page_lookup_busy_try(object, pi - page_base + ib, 1144 TRUE, &error); 1145 if (error) 1146 break; 1147 if (tp == NULL) 1148 break; 1149 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && 1150 (tp->flags & PG_CLEANCHK) == 0) { 1151 vm_page_wakeup(tp); 1152 break; 1153 } 1154 if ((tp->queue - tp->pc) == PQ_CACHE) { 1155 vm_page_flag_clear(tp, PG_CLEANCHK); 1156 vm_page_wakeup(tp); 1157 break; 1158 } 1159 vm_page_test_dirty(tp); 1160 if ((tp->dirty & tp->valid) == 0 && 1161 (tp->flags & PG_NEED_COMMIT) == 0) { 1162 vm_page_flag_clear(tp, PG_CLEANCHK); 1163 vm_page_wakeup(tp); 1164 break; 1165 } 1166 ma[ib] = tp; 1167 --ib; 1168 } 1169 ++ib; /* fixup */ 1170 1171 while (is < BLIST_MAX_ALLOC && 1172 pi - page_base + is < object->size) { 1173 vm_page_t tp; 1174 1175 tp = vm_page_lookup_busy_try(object, pi - page_base + is, 1176 TRUE, &error); 1177 if (error) 1178 break; 1179 if (tp == NULL) 1180 break; 1181 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && 1182 (tp->flags & PG_CLEANCHK) == 0) { 1183 vm_page_wakeup(tp); 1184 break; 1185 } 1186 if ((tp->queue - tp->pc) == PQ_CACHE) { 1187 vm_page_flag_clear(tp, PG_CLEANCHK); 1188 vm_page_wakeup(tp); 1189 break; 1190 } 1191 vm_page_test_dirty(tp); 1192 if ((tp->dirty & tp->valid) == 0 && 1193 (tp->flags & PG_NEED_COMMIT) == 0) { 1194 vm_page_flag_clear(tp, PG_CLEANCHK); 1195 vm_page_wakeup(tp); 1196 break; 1197 } 1198 ma[is] = tp; 1199 ++is; 1200 } 1201 1202 /* 1203 * All pages in the ma[] array are busied now 1204 */ 1205 for (i = ib; i < is; ++i) { 1206 vm_page_flag_clear(ma[i], PG_CLEANCHK); 1207 vm_page_hold(ma[i]); /* XXX need this any more? */ 1208 } 1209 vm_pageout_flush(&ma[ib], is - ib, pagerflags); 1210 for (i = ib; i < is; ++i) /* XXX need this any more? */ 1211 vm_page_unhold(ma[i]); 1212 } 1213 1214 /* 1215 * Implements the madvise function at the object/page level. 1216 * 1217 * MADV_WILLNEED (any object) 1218 * 1219 * Activate the specified pages if they are resident. 1220 * 1221 * MADV_DONTNEED (any object) 1222 * 1223 * Deactivate the specified pages if they are resident. 1224 * 1225 * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only) 1226 * 1227 * Deactivate and clean the specified pages if they are 1228 * resident. This permits the process to reuse the pages 1229 * without faulting or the kernel to reclaim the pages 1230 * without I/O. 1231 * 1232 * No requirements. 1233 */ 1234 void 1235 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, 1236 vm_pindex_t count, int advise) 1237 { 1238 vm_pindex_t end; 1239 vm_page_t m; 1240 int error; 1241 1242 if (object == NULL) 1243 return; 1244 1245 end = pindex + count; 1246 1247 vm_object_hold(object); 1248 1249 /* 1250 * Locate and adjust resident pages. This only applies to the 1251 * primary object in the mapping. 
	 */
	for (; pindex < end; pindex += 1) {
relookup:
		/*
		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
		 * and those pages must be OBJ_ONEMAPPING.
		 */
		if (advise == MADV_FREE) {
			if ((object->type != OBJT_DEFAULT &&
			     object->type != OBJT_SWAP) ||
			    (object->flags & OBJ_ONEMAPPING) == 0) {
				continue;
			}
		}

		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);

		if (error) {
			vm_page_sleep_busy(m, TRUE, "madvpo");
			goto relookup;
		}
		if (m == NULL) {
			/*
			 * There may be swap even if there is no backing page
			 */
			if (advise == MADV_FREE && object->type == OBJT_SWAP)
				swap_pager_freespace(object, pindex, 1);
			continue;
		}

		/*
		 * If the page is not in a normal active state, we skip it.
		 * If the page is not managed there are no page queues to
		 * mess with.  Things can break if we mess with pages in
		 * any of the below states.
		 */
		if (m->wire_count ||
		    (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
		    m->valid != VM_PAGE_BITS_ALL
		) {
			vm_page_wakeup(m);
			continue;
		}

		/*
		 * Theoretically once a page is known not to be busy, an
		 * interrupt cannot come along and rip it out from under us.
		 */
		if (advise == MADV_WILLNEED) {
			vm_page_activate(m);
		} else if (advise == MADV_DONTNEED) {
			vm_page_dontneed(m);
		} else if (advise == MADV_FREE) {
			/*
			 * Mark the page clean.  This will allow the page
			 * to be freed up by the system.  However, such pages
			 * are often reused quickly by malloc()/free()
			 * so we do not do anything that would cause
			 * a page fault if we can help it.
			 *
			 * Specifically, we do not try to actually free
			 * the page now nor do we try to put it in the
			 * cache (which would cause a page fault on reuse).
			 *
			 * But we do make the page as freeable as we
			 * can without actually taking the step of unmapping
			 * it.
			 */
			pmap_clear_modify(m);
			m->dirty = 0;
			m->act_count = 0;
			vm_page_dontneed(m);
			if (object->type == OBJT_SWAP)
				swap_pager_freespace(object, pindex, 1);
		}
		vm_page_wakeup(m);
	}
	vm_object_drop(object);
}

/*
 * Removes all physical pages in the specified object range from the
 * object's list of pages.
 *
 * No requirements.
 */
static int vm_object_page_remove_callback(vm_page_t p, void *data);

void
vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
		      boolean_t clean_only)
{
	struct rb_vm_page_scan_info info;
	int all;

	/*
	 * Degenerate cases and assertions
	 */
	vm_object_hold(object);
	if (object == NULL ||
	    (object->resident_page_count == 0 && object->swblock_count == 0)) {
		vm_object_drop(object);
		return;
	}
	KASSERT(object->type != OBJT_PHYS,
		("attempt to remove pages from a physical object"));

	/*
	 * Indicate that paging is occurring on the object
	 */
	vm_object_pip_add(object, 1);

	/*
	 * Figure out the actual removal range and whether we are removing
	 * the entire contents of the object or not.  If removing the entire
	 * contents, be sure to get all pages, even those that might be
	 * beyond the end of the object.
1369 */ 1370 info.object = object; 1371 info.start_pindex = start; 1372 if (end == 0) 1373 info.end_pindex = (vm_pindex_t)-1; 1374 else 1375 info.end_pindex = end - 1; 1376 info.limit = clean_only; 1377 info.count = 0; 1378 all = (start == 0 && info.end_pindex >= object->size - 1); 1379 1380 /* 1381 * Loop until we are sure we have gotten them all. 1382 */ 1383 do { 1384 info.error = 0; 1385 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 1386 vm_object_page_remove_callback, &info); 1387 } while (info.error); 1388 1389 /* 1390 * Remove any related swap if throwing away pages, or for 1391 * non-swap objects (the swap is a clean copy in that case). 1392 */ 1393 if (object->type != OBJT_SWAP || clean_only == FALSE) { 1394 if (all) 1395 swap_pager_freespace_all(object); 1396 else 1397 swap_pager_freespace(object, info.start_pindex, 1398 info.end_pindex - info.start_pindex + 1); 1399 } 1400 1401 /* 1402 * Cleanup 1403 */ 1404 vm_object_pip_wakeup(object); 1405 vm_object_drop(object); 1406 } 1407 1408 /* 1409 * The caller must hold the object. 1410 * 1411 * NOTE: User yields are allowed when removing more than one page, but not 1412 * allowed if only removing one page (the path for single page removals 1413 * might hold a spinlock). 1414 */ 1415 static int 1416 vm_object_page_remove_callback(vm_page_t p, void *data) 1417 { 1418 struct rb_vm_page_scan_info *info = data; 1419 1420 if (info->object != p->object || 1421 p->pindex < info->start_pindex || 1422 p->pindex > info->end_pindex) { 1423 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n", 1424 info->object, p); 1425 return(0); 1426 } 1427 if (vm_page_busy_try(p, TRUE)) { 1428 vm_page_sleep_busy(p, TRUE, "vmopar"); 1429 info->error = 1; 1430 return(0); 1431 } 1432 if (info->object != p->object) { 1433 /* this should never happen */ 1434 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n", 1435 info->object, p); 1436 vm_page_wakeup(p); 1437 return(0); 1438 } 1439 1440 /* 1441 * Wired pages cannot be destroyed, but they can be invalidated 1442 * and we do so if clean_only (limit) is not set. 1443 * 1444 * WARNING! The page may be wired due to being part of a buffer 1445 * cache buffer, and the buffer might be marked B_CACHE. 1446 * This is fine as part of a truncation but VFSs must be 1447 * sure to fix the buffer up when re-extending the file. 1448 * 1449 * NOTE! PG_NEED_COMMIT is ignored. 1450 */ 1451 if (p->wire_count != 0) { 1452 vm_page_protect(p, VM_PROT_NONE); 1453 if (info->limit == 0) 1454 p->valid = 0; 1455 vm_page_wakeup(p); 1456 goto done; 1457 } 1458 1459 /* 1460 * limit is our clean_only flag. If set and the page is dirty or 1461 * requires a commit, do not free it. If set and the page is being 1462 * held by someone, do not free it. 1463 */ 1464 if (info->limit && p->valid) { 1465 vm_page_test_dirty(p); 1466 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) { 1467 vm_page_wakeup(p); 1468 goto done; 1469 } 1470 } 1471 1472 /* 1473 * Destroy the page 1474 */ 1475 vm_page_protect(p, VM_PROT_NONE); 1476 vm_page_free(p); 1477 1478 /* 1479 * Must be at end to avoid SMP races, caller holds object token 1480 */ 1481 done: 1482 if ((++info->count & 63) == 0) 1483 lwkt_user_yield(); 1484 1485 return(0); 1486 } 1487 1488 /* 1489 * Try to extend prev_object into an adjoining region of virtual 1490 * memory, return TRUE on success. 1491 * 1492 * The caller does not need to hold (prev_object) but must have a stable 1493 * pointer to it (typically by holding the vm_map locked). 
 *
 * This function only works for anonymous memory objects which either
 * have (a) one reference or (b) we are extending the object's size.
 * Otherwise the related VM pages we want to use for the object might
 * be in use by another mapping.
 */
boolean_t
vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
		   vm_size_t prev_size, vm_size_t next_size)
{
	vm_pindex_t next_pindex;

	if (prev_object == NULL)
		return (TRUE);

	vm_object_hold(prev_object);

	if (prev_object->type != OBJT_DEFAULT &&
	    prev_object->type != OBJT_SWAP) {
		vm_object_drop(prev_object);
		return (FALSE);
	}

#if 0
	/* caller now checks this */
	/*
	 * Try to collapse the object first
	 */
	vm_object_collapse(prev_object, NULL);
#endif

#if 0
	/* caller now checks this */
	/*
	 * We can't coalesce if we shadow another object (figuring out the
	 * relationships become too complex).
	 */
	if (prev_object->backing_object != NULL) {
		vm_object_chain_release(prev_object);
		vm_object_drop(prev_object);
		return (FALSE);
	}
#endif

	prev_size >>= PAGE_SHIFT;
	next_size >>= PAGE_SHIFT;
	next_pindex = prev_pindex + prev_size;

	/*
	 * We can't if the object has more than one ref count unless we
	 * are extending it into newly minted space.
	 */
	if (prev_object->ref_count > 1 &&
	    prev_object->size != next_pindex) {
		vm_object_drop(prev_object);
		return (FALSE);
	}

	/*
	 * Remove any pages that may still be in the object from a previous
	 * deallocation.
	 */
	if (next_pindex < prev_object->size) {
		vm_object_page_remove(prev_object,
				      next_pindex,
				      next_pindex + next_size, FALSE);
		if (prev_object->type == OBJT_SWAP)
			swap_pager_freespace(prev_object,
					     next_pindex, next_size);
	}

	/*
	 * Extend the object if necessary.
	 */
	if (next_pindex + next_size > prev_object->size)
		prev_object->size = next_pindex + next_size;
	vm_object_drop(prev_object);

	return (TRUE);
}

/*
 * Make the object writable and flag it as possibly being dirty.
 *
 * The object might not be held (or might be held but held shared),
 * the related vnode is probably not held either.  Object and vnode are
 * stable by virtue of the vm_page busied by the caller preventing
 * destruction.
 *
 * If the related mount is flagged MNTK_THR_SYNC we need to call
 * vsetobjdirty().  Filesystems using this option usually shortcut
 * synchronization by only scanning the syncer list.
 */
void
vm_object_set_writeable_dirty(vm_object_t object)
{
	struct vnode *vp;

	/*vm_object_assert_held(object);*/
	/*
	 * Avoid contention in vm fault path by checking the state before
	 * issuing an atomic op on it.
	 */
	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
	}
	if (object->type == OBJT_VNODE &&
	    (vp = (struct vnode *)object->handle) != NULL) {
		if ((vp->v_flag & VOBJDIRTY) == 0) {
			if (vp->v_mount &&
			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
				/*
				 * New style THR_SYNC places vnodes on the
				 * syncer list more deterministically.
1609 */ 1610 vsetobjdirty(vp); 1611 } else { 1612 /* 1613 * Old style scan would not necessarily place 1614 * a vnode on the syncer list when possibly 1615 * modified via mmap. 1616 */ 1617 vsetflags(vp, VOBJDIRTY); 1618 } 1619 } 1620 } 1621 } 1622 1623 #include "opt_ddb.h" 1624 #ifdef DDB 1625 #include <sys/cons.h> 1626 1627 #include <ddb/ddb.h> 1628 1629 static int _vm_object_in_map (vm_map_t map, vm_object_t object, 1630 vm_map_entry_t entry); 1631 static int vm_object_in_map (vm_object_t object); 1632 1633 /* 1634 * The caller must hold the object. 1635 */ 1636 static int 1637 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) 1638 { 1639 vm_map_backing_t ba; 1640 vm_map_t tmpm; 1641 vm_map_entry_t tmpe; 1642 int entcount; 1643 1644 if (map == NULL) 1645 return 0; 1646 if (entry == NULL) { 1647 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root); 1648 entcount = map->nentries; 1649 while (entcount-- && tmpe) { 1650 if( _vm_object_in_map(map, object, tmpe)) { 1651 return 1; 1652 } 1653 tmpe = vm_map_rb_tree_RB_NEXT(tmpe); 1654 } 1655 return (0); 1656 } 1657 switch(entry->maptype) { 1658 case VM_MAPTYPE_SUBMAP: 1659 tmpm = entry->ba.sub_map; 1660 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root); 1661 entcount = tmpm->nentries; 1662 while (entcount-- && tmpe) { 1663 if( _vm_object_in_map(tmpm, object, tmpe)) { 1664 return 1; 1665 } 1666 tmpe = vm_map_rb_tree_RB_NEXT(tmpe); 1667 } 1668 break; 1669 case VM_MAPTYPE_NORMAL: 1670 case VM_MAPTYPE_VPAGETABLE: 1671 ba = &entry->ba; 1672 while (ba) { 1673 if (ba->object == object) 1674 return TRUE; 1675 ba = ba->backing_ba; 1676 } 1677 break; 1678 default: 1679 break; 1680 } 1681 return 0; 1682 } 1683 1684 static int vm_object_in_map_callback(struct proc *p, void *data); 1685 1686 struct vm_object_in_map_info { 1687 vm_object_t object; 1688 int rv; 1689 }; 1690 1691 /* 1692 * Debugging only 1693 */ 1694 static int 1695 vm_object_in_map(vm_object_t object) 1696 { 1697 struct vm_object_in_map_info info; 1698 1699 info.rv = 0; 1700 info.object = object; 1701 1702 allproc_scan(vm_object_in_map_callback, &info, 0); 1703 if (info.rv) 1704 return 1; 1705 if( _vm_object_in_map(&kernel_map, object, 0)) 1706 return 1; 1707 if( _vm_object_in_map(&pager_map, object, 0)) 1708 return 1; 1709 if( _vm_object_in_map(&buffer_map, object, 0)) 1710 return 1; 1711 return 0; 1712 } 1713 1714 /* 1715 * Debugging only 1716 */ 1717 static int 1718 vm_object_in_map_callback(struct proc *p, void *data) 1719 { 1720 struct vm_object_in_map_info *info = data; 1721 1722 if (p->p_vmspace) { 1723 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) { 1724 info->rv = 1; 1725 return -1; 1726 } 1727 } 1728 return (0); 1729 } 1730 1731 DB_SHOW_COMMAND(vmochk, vm_object_check) 1732 { 1733 struct vm_object_hash *hash; 1734 vm_object_t object; 1735 int n; 1736 1737 /* 1738 * make sure that internal objs are in a map somewhere 1739 * and none have zero ref counts. 
1740 */ 1741 for (n = 0; n < VMOBJ_HSIZE; ++n) { 1742 hash = &vm_object_hash[n]; 1743 for (object = TAILQ_FIRST(&hash->list); 1744 object != NULL; 1745 object = TAILQ_NEXT(object, object_entry)) { 1746 if (object->type == OBJT_MARKER) 1747 continue; 1748 if (object->handle != NULL || 1749 (object->type != OBJT_DEFAULT && 1750 object->type != OBJT_SWAP)) { 1751 continue; 1752 } 1753 if (object->ref_count == 0) { 1754 db_printf("vmochk: internal obj has " 1755 "zero ref count: %ld\n", 1756 (long)object->size); 1757 } 1758 if (vm_object_in_map(object)) 1759 continue; 1760 db_printf("vmochk: internal obj is not in a map: " 1761 "ref: %d, size: %lu: 0x%lx\n", 1762 object->ref_count, (u_long)object->size, 1763 (u_long)object->size); 1764 } 1765 } 1766 } 1767 1768 /* 1769 * Debugging only 1770 */ 1771 DB_SHOW_COMMAND(object, vm_object_print_static) 1772 { 1773 /* XXX convert args. */ 1774 vm_object_t object = (vm_object_t)addr; 1775 boolean_t full = have_addr; 1776 1777 vm_page_t p; 1778 1779 /* XXX count is an (unused) arg. Avoid shadowing it. */ 1780 #define count was_count 1781 1782 int count; 1783 1784 if (object == NULL) 1785 return; 1786 1787 db_iprintf( 1788 "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n", 1789 object, (int)object->type, (u_long)object->size, 1790 object->resident_page_count, object->ref_count, object->flags); 1791 /* 1792 * XXX no %qd in kernel. Truncate object->backing_object_offset. 1793 */ 1794 db_iprintf("\n"); 1795 1796 if (!full) 1797 return; 1798 1799 db_indent += 2; 1800 count = 0; 1801 RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) { 1802 if (count == 0) 1803 db_iprintf("memory:="); 1804 else if (count == 6) { 1805 db_printf("\n"); 1806 db_iprintf(" ..."); 1807 count = 0; 1808 } else 1809 db_printf(","); 1810 count++; 1811 1812 db_printf("(off=0x%lx,page=0x%lx)", 1813 (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p)); 1814 } 1815 if (count != 0) 1816 db_printf("\n"); 1817 db_indent -= 2; 1818 } 1819 1820 /* XXX. */ 1821 #undef count 1822 1823 /* 1824 * XXX need this non-static entry for calling from vm_map_print. 
1825 * 1826 * Debugging only 1827 */ 1828 void 1829 vm_object_print(/* db_expr_t */ long addr, 1830 boolean_t have_addr, 1831 /* db_expr_t */ long count, 1832 char *modif) 1833 { 1834 vm_object_print_static(addr, have_addr, count, modif); 1835 } 1836 1837 /* 1838 * Debugging only 1839 */ 1840 DB_SHOW_COMMAND(vmopag, vm_object_print_pages) 1841 { 1842 struct vm_object_hash *hash; 1843 vm_object_t object; 1844 int nl = 0; 1845 int c; 1846 int n; 1847 1848 for (n = 0; n < VMOBJ_HSIZE; ++n) { 1849 hash = &vm_object_hash[n]; 1850 for (object = TAILQ_FIRST(&hash->list); 1851 object != NULL; 1852 object = TAILQ_NEXT(object, object_entry)) { 1853 vm_pindex_t idx, fidx; 1854 vm_pindex_t osize; 1855 vm_paddr_t pa = -1, padiff; 1856 int rcount; 1857 vm_page_t m; 1858 1859 if (object->type == OBJT_MARKER) 1860 continue; 1861 db_printf("new object: %p\n", (void *)object); 1862 if ( nl > 18) { 1863 c = cngetc(); 1864 if (c != ' ') 1865 return; 1866 nl = 0; 1867 } 1868 nl++; 1869 rcount = 0; 1870 fidx = 0; 1871 osize = object->size; 1872 if (osize > 128) 1873 osize = 128; 1874 for (idx = 0; idx < osize; idx++) { 1875 m = vm_page_lookup(object, idx); 1876 if (m == NULL) { 1877 if (rcount) { 1878 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 1879 (long)fidx, rcount, (long)pa); 1880 if ( nl > 18) { 1881 c = cngetc(); 1882 if (c != ' ') 1883 return; 1884 nl = 0; 1885 } 1886 nl++; 1887 rcount = 0; 1888 } 1889 continue; 1890 } 1891 1892 if (rcount && 1893 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { 1894 ++rcount; 1895 continue; 1896 } 1897 if (rcount) { 1898 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m); 1899 padiff >>= PAGE_SHIFT; 1900 padiff &= PQ_L2_MASK; 1901 if (padiff == 0) { 1902 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE; 1903 ++rcount; 1904 continue; 1905 } 1906 db_printf(" index(%ld)run(%d)pa(0x%lx)", 1907 (long)fidx, rcount, (long)pa); 1908 db_printf("pd(%ld)\n", (long)padiff); 1909 if ( nl > 18) { 1910 c = cngetc(); 1911 if (c != ' ') 1912 return; 1913 nl = 0; 1914 } 1915 nl++; 1916 } 1917 fidx = idx; 1918 pa = VM_PAGE_TO_PHYS(m); 1919 rcount = 1; 1920 } 1921 if (rcount) { 1922 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 1923 (long)fidx, rcount, (long)pa); 1924 if ( nl > 18) { 1925 c = cngetc(); 1926 if (c != ' ') 1927 return; 1928 nl = 0; 1929 } 1930 nl++; 1931 } 1932 } 1933 } 1934 } 1935 #endif /* DDB */ 1936