1 /* 2 * Copyright (c) 1998,2004 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1994 John S. Dyson 35 * Copyright (c) 1990 University of Utah. 36 * Copyright (c) 1991, 1993 37 * The Regents of the University of California. All rights reserved. 38 * 39 * This code is derived from software contributed to Berkeley by 40 * the Systems Programming Group of the University of Utah Computer 41 * Science Department. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * New Swap System 72 * Matthew Dillon 73 * 74 * Radix Bitmap 'blists'. 75 * 76 * - The new swapper uses the new radix bitmap code. This should scale 77 * to arbitrarily small or arbitrarily large swap spaces and an almost 78 * arbitrary degree of fragmentation. 79 * 80 * Features: 81 * 82 * - on the fly reallocation of swap during putpages. The new system 83 * does not try to keep previously allocated swap blocks for dirty 84 * pages. 85 * 86 * - on the fly deallocation of swap 87 * 88 * - No more garbage collection required. Unnecessarily allocated swap 89 * blocks only exist for dirty vm_page_t's now and these are already 90 * cycled (in a high-load system) by the pager. We also do on-the-fly 91 * removal of invalidated swap blocks when a page is destroyed 92 * or renamed. 93 * 94 * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ 95 * 96 * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 97 * 98 * $FreeBSD: src/sys/vm/swap_pager.c,v 1.130.2.12 2002/08/31 21:15:55 dillon Exp $ 99 * $DragonFly: src/sys/vm/swap_pager.c,v 1.32 2008/07/01 02:02:56 dillon Exp $ 100 */ 101 102 #include <sys/param.h> 103 #include <sys/systm.h> 104 #include <sys/conf.h> 105 #include <sys/kernel.h> 106 #include <sys/proc.h> 107 #include <sys/buf.h> 108 #include <sys/vnode.h> 109 #include <sys/malloc.h> 110 #include <sys/vmmeter.h> 111 #include <sys/sysctl.h> 112 #include <sys/blist.h> 113 #include <sys/lock.h> 114 #include <sys/thread2.h> 115 116 #ifndef MAX_PAGEOUT_CLUSTER 117 #define MAX_PAGEOUT_CLUSTER 16 118 #endif 119 120 #define SWB_NPAGES MAX_PAGEOUT_CLUSTER 121 122 #include "opt_swap.h" 123 #include <vm/vm.h> 124 #include <vm/vm_object.h> 125 #include <vm/vm_page.h> 126 #include <vm/vm_pager.h> 127 #include <vm/vm_pageout.h> 128 #include <vm/swap_pager.h> 129 #include <vm/vm_extern.h> 130 #include <vm/vm_zone.h> 131 #include <vm/vnode_pager.h> 132 133 #include <sys/buf2.h> 134 #include <vm/vm_page2.h> 135 136 #define SWM_FREE 0x02 /* free, period */ 137 #define SWM_POP 0x04 /* pop out */ 138 139 #define SWBIO_READ 0x01 140 #define SWBIO_WRITE 0x02 141 #define SWBIO_SYNC 0x04 142 143 struct swfreeinfo { 144 vm_object_t object; 145 vm_pindex_t basei; 146 vm_pindex_t begi; 147 vm_pindex_t endi; /* inclusive */ 148 }; 149 150 /* 151 * vm_swap_size is in page-sized chunks now. It was DEV_BSIZE'd chunks 152 * in the old system. 
153 */ 154 155 int swap_pager_full; /* swap space exhaustion (task killing) */ 156 int vm_swap_cache_use; 157 int vm_swap_anon_use; 158 159 static int swap_pager_almost_full; /* swap space exhaustion (w/ hysteresis)*/ 160 static int nsw_rcount; /* free read buffers */ 161 static int nsw_wcount_sync; /* limit write buffers / synchronous */ 162 static int nsw_wcount_async; /* limit write buffers / asynchronous */ 163 static int nsw_wcount_async_max;/* assigned maximum */ 164 static int nsw_cluster_max; /* maximum VOP I/O allowed */ 165 166 struct blist *swapblist; 167 static int swap_async_max = 4; /* maximum in-progress async I/O's */ 168 static int swap_burst_read = 0; /* allow burst reading */ 169 170 extern struct vnode *swapdev_vp; /* from vm_swap.c */ 171 172 SYSCTL_INT(_vm, OID_AUTO, swap_async_max, 173 CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops"); 174 SYSCTL_INT(_vm, OID_AUTO, swap_burst_read, 175 CTLFLAG_RW, &swap_burst_read, 0, "Allow burst reads for pageins"); 176 177 SYSCTL_INT(_vm, OID_AUTO, swap_cache_use, 178 CTLFLAG_RD, &vm_swap_cache_use, 0, ""); 179 SYSCTL_INT(_vm, OID_AUTO, swap_anon_use, 180 CTLFLAG_RD, &vm_swap_anon_use, 0, ""); 181 182 vm_zone_t swap_zone; 183 184 /* 185 * Red-Black tree for swblock entries 186 */ 187 RB_GENERATE2(swblock_rb_tree, swblock, swb_entry, rb_swblock_compare, 188 vm_pindex_t, swb_index); 189 190 int 191 rb_swblock_compare(struct swblock *swb1, struct swblock *swb2) 192 { 193 if (swb1->swb_index < swb2->swb_index) 194 return(-1); 195 if (swb1->swb_index > swb2->swb_index) 196 return(1); 197 return(0); 198 } 199 200 static 201 int 202 rb_swblock_scancmp(struct swblock *swb, void *data) 203 { 204 struct swfreeinfo *info = data; 205 206 if (swb->swb_index < info->basei) 207 return(-1); 208 if (swb->swb_index > info->endi) 209 return(1); 210 return(0); 211 } 212 213 static 214 int 215 rb_swblock_condcmp(struct swblock *swb, void *data) 216 { 217 struct swfreeinfo *info = data; 218 219 if (swb->swb_index < info->basei) 220 return(-1); 221 return(0); 222 } 223 224 /* 225 * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure 226 * calls hooked from other parts of the VM system and do not appear here. 227 * (see vm/swap_pager.h). 228 */ 229 230 static void swap_pager_dealloc (vm_object_t object); 231 static int swap_pager_getpage (vm_object_t, vm_page_t *, int); 232 static void swap_chain_iodone(struct bio *biox); 233 234 struct pagerops swappagerops = { 235 swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ 236 swap_pager_getpage, /* pagein */ 237 swap_pager_putpages, /* pageout */ 238 swap_pager_haspage /* get backing store status for page */ 239 }; 240 241 /* 242 * dmmax is in page-sized chunks with the new swap system. It was 243 * dev-bsized chunks in the old. dmmax is always a power of 2. 244 * 245 * swap_*() routines are externally accessible. swp_*() routines are 246 * internal. 
247 */ 248 249 int dmmax; 250 static int dmmax_mask; 251 int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ 252 int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ 253 254 static __inline void swp_sizecheck (void); 255 static void swp_pager_async_iodone (struct bio *bio); 256 257 /* 258 * Swap bitmap functions 259 */ 260 261 static __inline void swp_pager_freeswapspace (vm_object_t object, daddr_t blk, int npages); 262 static __inline daddr_t swp_pager_getswapspace (vm_object_t object, int npages); 263 264 /* 265 * Metadata functions 266 */ 267 268 static void swp_pager_meta_convert (vm_object_t); 269 static void swp_pager_meta_build (vm_object_t, vm_pindex_t, daddr_t); 270 static void swp_pager_meta_free (vm_object_t, vm_pindex_t, vm_pindex_t); 271 static void swp_pager_meta_free_all (vm_object_t); 272 static daddr_t swp_pager_meta_ctl (vm_object_t, vm_pindex_t, int); 273 274 /* 275 * SWP_SIZECHECK() - update swap_pager_full indication 276 * 277 * update the swap_pager_almost_full indication and warn when we are 278 * about to run out of swap space, using lowat/hiwat hysteresis. 279 * 280 * Clear swap_pager_full ( task killing ) indication when lowat is met. 281 * 282 * No restrictions on call 283 * This routine may not block. 284 * This routine must be called at splvm() 285 */ 286 287 static __inline void 288 swp_sizecheck(void) 289 { 290 if (vm_swap_size < nswap_lowat) { 291 if (swap_pager_almost_full == 0) { 292 kprintf("swap_pager: out of swap space\n"); 293 swap_pager_almost_full = 1; 294 } 295 } else { 296 swap_pager_full = 0; 297 if (vm_swap_size > nswap_hiwat) 298 swap_pager_almost_full = 0; 299 } 300 } 301 302 /* 303 * SWAP_PAGER_INIT() - initialize the swap pager! 304 * 305 * Expected to be started from system init. NOTE: This code is run 306 * before much else so be careful what you depend on. Most of the VM 307 * system has yet to be initialized at this point. 308 */ 309 static void 310 swap_pager_init(void *arg __unused) 311 { 312 /* 313 * Device Stripe, in PAGE_SIZE'd blocks 314 */ 315 dmmax = SWB_NPAGES * 2; 316 dmmax_mask = ~(dmmax - 1); 317 } 318 SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_THIRD, swap_pager_init, NULL) 319 320 /* 321 * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process 322 * 323 * Expected to be started from pageout process once, prior to entering 324 * its main loop. 325 */ 326 327 void 328 swap_pager_swap_init(void) 329 { 330 int n, n2; 331 332 /* 333 * Number of in-transit swap bp operations. Don't 334 * exhaust the pbufs completely. Make sure we 335 * initialize workable values (0 will work for hysteresis 336 * but it isn't very efficient). 337 * 338 * The nsw_cluster_max is constrained by the number of pages an XIO 339 * holds, i.e., (MAXPHYS/PAGE_SIZE) and our locally defined 340 * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are 341 * constrained by the swap device interleave stripe size. 342 * 343 * Currently we hardwire nsw_wcount_async to 4. This limit is 344 * designed to prevent other I/O from having high latencies due to 345 * our pageout I/O. The value 4 works well for one or two active swap 346 * devices but is probably a little low if you have more. Even so, 347 * a higher value would probably generate only a limited improvement 348 * with three or four active swap devices since the system does not 349 * typically have to pageout at extreme bandwidths. 
We will want 350 * at least 2 per swap devices, and 4 is a pretty good value if you 351 * have one NFS swap device due to the command/ack latency over NFS. 352 * So it all works out pretty well. 353 */ 354 355 nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); 356 357 nsw_rcount = (nswbuf + 1) / 2; 358 nsw_wcount_sync = (nswbuf + 3) / 4; 359 nsw_wcount_async = 4; 360 nsw_wcount_async_max = nsw_wcount_async; 361 362 /* 363 * The zone is dynamically allocated so generally size it to 364 * maxswzone (32MB to 512MB of KVM). Set a minimum size based 365 * on physical memory of around 8x (each swblock can hold 16 pages). 366 * 367 * With the advent of SSDs (vs HDs) the practical (swap:memory) ratio 368 * has increased dramatically. 369 */ 370 n = vmstats.v_page_count / 2; 371 if (maxswzone && n < maxswzone / sizeof(struct swblock)) 372 n = maxswzone / sizeof(struct swblock); 373 n2 = n; 374 375 do { 376 swap_zone = zinit( 377 "SWAPMETA", 378 sizeof(struct swblock), 379 n, 380 ZONE_INTERRUPT, 381 1); 382 if (swap_zone != NULL) 383 break; 384 /* 385 * if the allocation failed, try a zone two thirds the 386 * size of the previous attempt. 387 */ 388 n -= ((n + 2) / 3); 389 } while (n > 0); 390 391 if (swap_zone == NULL) 392 panic("swap_pager_swap_init: swap_zone == NULL"); 393 if (n2 != n) 394 kprintf("Swap zone entries reduced from %d to %d.\n", n2, n); 395 } 396 397 /* 398 * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate 399 * its metadata structures. 400 * 401 * This routine is called from the mmap and fork code to create a new 402 * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object 403 * and then converting it with swp_pager_meta_convert(). 404 * 405 * This routine may block in vm_object_allocate() and create a named 406 * object lookup race, so we must interlock. We must also run at 407 * splvm() for the object lookup to handle races with interrupts, but 408 * we do not have to maintain splvm() in between the lookup and the 409 * add because (I believe) it is not possible to attempt to create 410 * a new swap object w/handle when a default object with that handle 411 * already exists. 412 */ 413 414 vm_object_t 415 swap_pager_alloc(void *handle, off_t size, vm_prot_t prot, off_t offset) 416 { 417 vm_object_t object; 418 419 KKASSERT(handle == NULL); 420 #if 0 421 if (handle) { 422 /* 423 * Reference existing named region or allocate new one. There 424 * should not be a race here against swp_pager_meta_build() 425 * as called from vm_page_remove() in regards to the lookup 426 * of the handle. 427 */ 428 while (sw_alloc_interlock) { 429 sw_alloc_interlock = -1; 430 tsleep(&sw_alloc_interlock, 0, "swpalc", 0); 431 } 432 sw_alloc_interlock = 1; 433 434 object = vm_pager_object_lookup(NOBJLIST(handle), handle); 435 436 if (object != NULL) { 437 vm_object_reference(object); 438 } else { 439 object = vm_object_allocate(OBJT_DEFAULT, 440 OFF_TO_IDX(offset + PAGE_MASK + size)); 441 object->handle = handle; 442 swp_pager_meta_convert(object); 443 } 444 445 if (sw_alloc_interlock < 0) 446 wakeup(&sw_alloc_interlock); 447 sw_alloc_interlock = 0; 448 } else { ... } 449 #endif 450 object = vm_object_allocate(OBJT_DEFAULT, 451 OFF_TO_IDX(offset + PAGE_MASK + size)); 452 swp_pager_meta_convert(object); 453 454 return (object); 455 } 456 457 /* 458 * SWAP_PAGER_DEALLOC() - remove swap metadata from object 459 * 460 * The swap backing for the object is destroyed. 
The code is 461 * designed such that we can reinstantiate it later, but this 462 * routine is typically called only when the entire object is 463 * about to be destroyed. 464 * 465 * This routine may block, but no longer does. 466 * 467 * The object must be locked or unreferenceable. 468 */ 469 470 static void 471 swap_pager_dealloc(vm_object_t object) 472 { 473 vm_object_pip_wait(object, "swpdea"); 474 475 /* 476 * Free all remaining metadata. We only bother to free it from 477 * the swap meta data. We do not attempt to free swapblk's still 478 * associated with vm_page_t's for this object. We do not care 479 * if paging is still in progress on some objects. 480 */ 481 crit_enter(); 482 swp_pager_meta_free_all(object); 483 crit_exit(); 484 } 485 486 /************************************************************************ 487 * SWAP PAGER BITMAP ROUTINES * 488 ************************************************************************/ 489 490 /* 491 * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space 492 * 493 * Allocate swap for the requested number of pages. The starting 494 * swap block number (a page index) is returned or SWAPBLK_NONE 495 * if the allocation failed. 496 * 497 * Also has the side effect of advising that somebody made a mistake 498 * when they configured swap and didn't configure enough. 499 * 500 * Must be called at splvm() to avoid races with bitmap frees from 501 * vm_page_remove() aka swap_pager_page_removed(). 502 * 503 * This routine may not block 504 * This routine must be called at splvm(). 505 */ 506 static __inline daddr_t 507 swp_pager_getswapspace(vm_object_t object, int npages) 508 { 509 daddr_t blk; 510 511 if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) { 512 if (swap_pager_full != 2) { 513 kprintf("swap_pager_getswapspace: failed\n"); 514 swap_pager_full = 2; 515 swap_pager_almost_full = 1; 516 } 517 } else { 518 vm_swap_size -= npages; 519 if (object->type == OBJT_SWAP) 520 vm_swap_anon_use += npages; 521 else 522 vm_swap_cache_use += npages; 523 swp_sizecheck(); 524 } 525 return(blk); 526 } 527 528 /* 529 * SWP_PAGER_FREESWAPSPACE() - free raw swap space 530 * 531 * This routine returns the specified swap blocks back to the bitmap. 532 * 533 * Note: This routine may not block (it could in the old swap code), 534 * and through the use of the new blist routines it does not block. 535 * 536 * We must be called at splvm() to avoid races with bitmap frees from 537 * vm_page_remove() aka swap_pager_page_removed(). 538 * 539 * This routine may not block 540 * This routine must be called at splvm(). 541 */ 542 543 static __inline void 544 swp_pager_freeswapspace(vm_object_t object, daddr_t blk, int npages) 545 { 546 blist_free(swapblist, blk, npages); 547 vm_swap_size += npages; 548 if (object->type == OBJT_SWAP) 549 vm_swap_anon_use -= npages; 550 else 551 vm_swap_cache_use -= npages; 552 swp_sizecheck(); 553 } 554 555 /* 556 * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page 557 * range within an object. 558 * 559 * This is a globally accessible routine. 560 * 561 * This routine removes swapblk assignments from swap metadata. 562 * 563 * The external callers of this routine typically have already destroyed 564 * or renamed vm_page_t's associated with this range in the object so 565 * we should be ok. 566 * 567 * This routine may be called at any spl. We up our spl to splvm 568 * temporarily in order to perform the metadata removal. 
569 */ 570 void 571 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_pindex_t size) 572 { 573 crit_enter(); 574 swp_pager_meta_free(object, start, size); 575 crit_exit(); 576 } 577 578 void 579 swap_pager_freespace_all(vm_object_t object) 580 { 581 crit_enter(); 582 swp_pager_meta_free_all(object); 583 crit_exit(); 584 } 585 586 /* 587 * This function conditionally frees swap cache swap starting at 588 * (*basei) in the object. (count) swap blocks will be nominally freed. 589 * The actual number of blocks freed can be more or less than the 590 * requested number. 591 * 592 * This function nominally returns the number of blocks freed. However, 593 * the actual number of blocks freed may be less then the returned value. 594 * If the function is unable to exhaust the object or if it is able to 595 * free (approximately) the requested number of blocks it returns 596 * a value n > count. 597 * 598 * If we exhaust the object we will return a value n <= count. 599 * 600 * Must be called from a critical section. 601 */ 602 static int swap_pager_condfree_callback(struct swblock *swap, void *data); 603 604 int 605 swap_pager_condfree(vm_object_t object, vm_pindex_t *basei, int count) 606 { 607 struct swfreeinfo info; 608 609 info.object = object; 610 info.basei = *basei; /* skip up to this page index */ 611 info.begi = count; /* max swap pages to destroy */ 612 info.endi = count * 8; /* max swblocks to scan */ 613 614 swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_condcmp, 615 swap_pager_condfree_callback, &info); 616 *basei = info.basei; 617 if (info.endi < 0 && info.begi <= count) 618 info.begi = count + 1; 619 return(count - (int)info.begi); 620 } 621 622 /* 623 * The idea is to free whole meta-block to avoid fragmenting 624 * the swap space or disk I/O. We only do this if NO VM pages 625 * are present. 626 * 627 * We do not have to deal with clearing PG_SWAPPED in related VM 628 * pages because there are no related VM pages. 629 */ 630 static int 631 swap_pager_condfree_callback(struct swblock *swap, void *data) 632 { 633 struct swfreeinfo *info = data; 634 vm_object_t object = info->object; 635 int i; 636 637 for (i = 0; i < SWAP_META_PAGES; ++i) { 638 if (vm_page_lookup(object, swap->swb_index + i)) 639 break; 640 } 641 info->basei = swap->swb_index + SWAP_META_PAGES; 642 if (i == SWAP_META_PAGES) { 643 info->begi -= swap->swb_count; 644 swap_pager_freespace(object, swap->swb_index, SWAP_META_PAGES); 645 } 646 --info->endi; 647 if ((int)info->begi < 0 || (int)info->endi < 0) 648 return(-1); 649 return(0); 650 } 651 652 /* 653 * Called by vm_page_alloc() when a new VM page is inserted 654 * into a VM object. Checks whether swap has been assigned to 655 * the page and sets PG_SWAPPED as necessary. 656 */ 657 void 658 swap_pager_page_inserted(vm_page_t m) 659 { 660 if (m->object->swblock_count) { 661 crit_enter(); 662 if (swp_pager_meta_ctl(m->object, m->pindex, 0) != SWAPBLK_NONE) 663 vm_page_flag_set(m, PG_SWAPPED); 664 crit_exit(); 665 } 666 } 667 668 /* 669 * SWAP_PAGER_RESERVE() - reserve swap blocks in object 670 * 671 * Assigns swap blocks to the specified range within the object. The 672 * swap blocks are not zerod. Any previous swap assignment is destroyed. 673 * 674 * Returns 0 on success, -1 on failure. 
675 */ 676 int 677 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) 678 { 679 int n = 0; 680 daddr_t blk = SWAPBLK_NONE; 681 vm_pindex_t beg = start; /* save start index */ 682 683 crit_enter(); 684 while (size) { 685 if (n == 0) { 686 n = BLIST_MAX_ALLOC; 687 while ((blk = swp_pager_getswapspace(object, n)) == 688 SWAPBLK_NONE) 689 { 690 n >>= 1; 691 if (n == 0) { 692 swp_pager_meta_free(object, beg, 693 start - beg); 694 crit_exit(); 695 return(-1); 696 } 697 } 698 } 699 swp_pager_meta_build(object, start, blk); 700 --size; 701 ++start; 702 ++blk; 703 --n; 704 } 705 swp_pager_meta_free(object, start, n); 706 crit_exit(); 707 return(0); 708 } 709 710 /* 711 * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager 712 * and destroy the source. 713 * 714 * Copy any valid swapblks from the source to the destination. In 715 * cases where both the source and destination have a valid swapblk, 716 * we keep the destination's. 717 * 718 * This routine is allowed to block. It may block allocating metadata 719 * indirectly through swp_pager_meta_build() or if paging is still in 720 * progress on the source. 721 * 722 * This routine can be called at any spl 723 * 724 * XXX vm_page_collapse() kinda expects us not to block because we 725 * supposedly do not need to allocate memory, but for the moment we 726 * *may* have to get a little memory from the zone allocator, but 727 * it is taken from the interrupt memory. We should be ok. 728 * 729 * The source object contains no vm_page_t's (which is just as well) 730 * 731 * The source object is of type OBJT_SWAP. 732 * 733 * The source and destination objects must be locked or 734 * inaccessible (XXX are they ?) 735 */ 736 737 void 738 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, 739 vm_pindex_t base_index, int destroysource) 740 { 741 vm_pindex_t i; 742 743 crit_enter(); 744 745 /* 746 * transfer source to destination. 747 */ 748 for (i = 0; i < dstobject->size; ++i) { 749 daddr_t dstaddr; 750 751 /* 752 * Locate (without changing) the swapblk on the destination, 753 * unless it is invalid in which case free it silently, or 754 * if the destination is a resident page, in which case the 755 * source is thrown away. 756 */ 757 dstaddr = swp_pager_meta_ctl(dstobject, i, 0); 758 759 if (dstaddr == SWAPBLK_NONE) { 760 /* 761 * Destination has no swapblk and is not resident, 762 * copy source. 763 */ 764 daddr_t srcaddr; 765 766 srcaddr = swp_pager_meta_ctl(srcobject, 767 base_index + i, SWM_POP); 768 769 if (srcaddr != SWAPBLK_NONE) 770 swp_pager_meta_build(dstobject, i, srcaddr); 771 } else { 772 /* 773 * Destination has valid swapblk or it is represented 774 * by a resident page. We destroy the sourceblock. 775 */ 776 swp_pager_meta_ctl(srcobject, base_index + i, SWM_FREE); 777 } 778 } 779 780 /* 781 * Free left over swap blocks in source. 782 * 783 * We have to revert the type to OBJT_DEFAULT so we do not accidently 784 * double-remove the object from the swap queues. 785 */ 786 if (destroysource) { 787 /* 788 * Reverting the type is not necessary, the caller is going 789 * to destroy srcobject directly, but I'm doing it here 790 * for consistency since we've removed the object from its 791 * queues. 792 */ 793 swp_pager_meta_free_all(srcobject); 794 if (srcobject->type == OBJT_SWAP) 795 srcobject->type = OBJT_DEFAULT; 796 } 797 crit_exit(); 798 } 799 800 /* 801 * SWAP_PAGER_HASPAGE() - determine if we have good backing store for 802 * the requested page. 
803 * 804 * We determine whether good backing store exists for the requested 805 * page and return TRUE if it does, FALSE if it doesn't. 806 * 807 * If TRUE, we also try to determine how much valid, contiguous backing 808 * store exists before and after the requested page within a reasonable 809 * distance. We do not try to restrict it to the swap device stripe 810 * (that is handled in getpages/putpages). It probably isn't worth 811 * doing here. 812 */ 813 814 boolean_t 815 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex) 816 { 817 daddr_t blk0; 818 819 /* 820 * do we have good backing store at the requested index ? 821 */ 822 823 crit_enter(); 824 blk0 = swp_pager_meta_ctl(object, pindex, 0); 825 826 if (blk0 == SWAPBLK_NONE) { 827 crit_exit(); 828 return (FALSE); 829 } 830 831 #if 0 832 /* 833 * find backwards-looking contiguous good backing store 834 */ 835 if (before != NULL) { 836 int i; 837 838 for (i = 1; i < (SWB_NPAGES/2); ++i) { 839 daddr_t blk; 840 841 if (i > pindex) 842 break; 843 blk = swp_pager_meta_ctl(object, pindex - i, 0); 844 if (blk != blk0 - i) 845 break; 846 } 847 *before = (i - 1); 848 } 849 850 /* 851 * find forward-looking contiguous good backing store 852 */ 853 854 if (after != NULL) { 855 int i; 856 857 for (i = 1; i < (SWB_NPAGES/2); ++i) { 858 daddr_t blk; 859 860 blk = swp_pager_meta_ctl(object, pindex + i, 0); 861 if (blk != blk0 + i) 862 break; 863 } 864 *after = (i - 1); 865 } 866 #endif 867 crit_exit(); 868 return (TRUE); 869 } 870 871 /* 872 * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page 873 * 874 * This removes any associated swap backing store, whether valid or 875 * not, from the page. This operates on any VM object, not just OBJT_SWAP 876 * objects. 877 * 878 * This routine is typically called when a page is made dirty, at 879 * which point any associated swap can be freed. MADV_FREE also 880 * calls us in a special-case situation 881 * 882 * NOTE!!! If the page is clean and the swap was valid, the caller 883 * should make the page dirty before calling this routine. This routine 884 * does NOT change the m->dirty status of the page. Also: MADV_FREE 885 * depends on it. 886 * 887 * This routine may not block. 888 * 889 * The page must be busied or soft-busied. 890 */ 891 void 892 swap_pager_unswapped(vm_page_t m) 893 { 894 if (m->flags & PG_SWAPPED) { 895 crit_enter(); 896 KKASSERT(m->flags & PG_SWAPPED); 897 swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE); 898 vm_page_flag_clear(m, PG_SWAPPED); 899 crit_exit(); 900 } 901 } 902 903 /* 904 * SWAP_PAGER_STRATEGY() - read, write, free blocks 905 * 906 * This implements a VM OBJECT strategy function using swap backing store. 907 * This can operate on any VM OBJECT type, not necessarily just OBJT_SWAP 908 * types. 909 * 910 * This is intended to be a cacheless interface (i.e. caching occurs at 911 * higher levels), and is also used as a swap-based SSD cache for vnode 912 * and device objects. 913 * 914 * All I/O goes directly to and from the swap device. 915 * 916 * We currently attempt to run I/O synchronously or asynchronously as 917 * the caller requests. This isn't perfect because we loose error 918 * sequencing when we run multiple ops in parallel to satisfy a request. 919 * But this is swap, so we let it all hang out. 
920 */ 921 void 922 swap_pager_strategy(vm_object_t object, struct bio *bio) 923 { 924 struct buf *bp = bio->bio_buf; 925 struct bio *nbio; 926 vm_pindex_t start; 927 vm_pindex_t biox_blkno = 0; 928 int count; 929 char *data; 930 struct bio *biox; 931 struct buf *bufx; 932 struct bio_track *track; 933 934 /* 935 * tracking for swapdev vnode I/Os 936 */ 937 if (bp->b_cmd == BUF_CMD_READ) 938 track = &swapdev_vp->v_track_read; 939 else 940 track = &swapdev_vp->v_track_write; 941 942 if (bp->b_bcount & PAGE_MASK) { 943 bp->b_error = EINVAL; 944 bp->b_flags |= B_ERROR | B_INVAL; 945 biodone(bio); 946 kprintf("swap_pager_strategy: bp %p offset %lld size %d, " 947 "not page bounded\n", 948 bp, (long long)bio->bio_offset, (int)bp->b_bcount); 949 return; 950 } 951 952 /* 953 * Clear error indication, initialize page index, count, data pointer. 954 */ 955 bp->b_error = 0; 956 bp->b_flags &= ~B_ERROR; 957 bp->b_resid = bp->b_bcount; 958 959 start = (vm_pindex_t)(bio->bio_offset >> PAGE_SHIFT); 960 count = howmany(bp->b_bcount, PAGE_SIZE); 961 data = bp->b_data; 962 963 /* 964 * Deal with BUF_CMD_FREEBLKS 965 */ 966 if (bp->b_cmd == BUF_CMD_FREEBLKS) { 967 /* 968 * FREE PAGE(s) - destroy underlying swap that is no longer 969 * needed. 970 */ 971 crit_enter(); 972 swp_pager_meta_free(object, start, count); 973 crit_exit(); 974 bp->b_resid = 0; 975 biodone(bio); 976 return; 977 } 978 979 /* 980 * We need to be able to create a new cluster of I/O's. We cannot 981 * use the caller fields of the passed bio so push a new one. 982 * 983 * Because nbio is just a placeholder for the cluster links, 984 * we can biodone() the original bio instead of nbio to make 985 * things a bit more efficient. 986 */ 987 nbio = push_bio(bio); 988 nbio->bio_offset = bio->bio_offset; 989 nbio->bio_caller_info1.cluster_head = NULL; 990 nbio->bio_caller_info2.cluster_tail = NULL; 991 992 biox = NULL; 993 bufx = NULL; 994 995 /* 996 * Execute read or write 997 */ 998 crit_enter(); 999 while (count > 0) { 1000 daddr_t blk; 1001 1002 /* 1003 * Obtain block. If block not found and writing, allocate a 1004 * new block and build it into the object. 1005 */ 1006 blk = swp_pager_meta_ctl(object, start, 0); 1007 if ((blk == SWAPBLK_NONE) && bp->b_cmd != BUF_CMD_READ) { 1008 blk = swp_pager_getswapspace(object, 1); 1009 if (blk == SWAPBLK_NONE) { 1010 bp->b_error = ENOMEM; 1011 bp->b_flags |= B_ERROR; 1012 break; 1013 } 1014 swp_pager_meta_build(object, start, blk); 1015 } 1016 1017 /* 1018 * Do we have to flush our current collection? Yes if: 1019 * 1020 * - no swap block at this index 1021 * - swap block is not contiguous 1022 * - we cross a physical disk boundry in the 1023 * stripe. 1024 */ 1025 if ( 1026 biox && (biox_blkno + btoc(bufx->b_bcount) != blk || 1027 ((biox_blkno ^ blk) & dmmax_mask) 1028 ) 1029 ) { 1030 if (bp->b_cmd == BUF_CMD_READ) { 1031 ++mycpu->gd_cnt.v_swapin; 1032 mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount); 1033 } else { 1034 ++mycpu->gd_cnt.v_swapout; 1035 mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount); 1036 bufx->b_dirtyend = bufx->b_bcount; 1037 } 1038 1039 /* 1040 * Finished with this buf. 1041 */ 1042 KKASSERT(bufx->b_bcount != 0); 1043 if (bufx->b_cmd != BUF_CMD_READ) 1044 bufx->b_dirtyend = bufx->b_bcount; 1045 biox = NULL; 1046 bufx = NULL; 1047 } 1048 1049 /* 1050 * Add new swapblk to biox, instantiating biox if necessary. 1051 * Zero-fill reads are able to take a shortcut. 1052 */ 1053 if (blk == SWAPBLK_NONE) { 1054 /* 1055 * We can only get here if we are reading. 
Since 1056 * we are at splvm() we can safely modify b_resid, 1057 * even if chain ops are in progress. 1058 */ 1059 bzero(data, PAGE_SIZE); 1060 bp->b_resid -= PAGE_SIZE; 1061 } else { 1062 if (biox == NULL) { 1063 /* XXX chain count > 4, wait to <= 4 */ 1064 1065 bufx = getpbuf(NULL); 1066 biox = &bufx->b_bio1; 1067 cluster_append(nbio, bufx); 1068 bufx->b_flags |= (bufx->b_flags & B_ORDERED); 1069 bufx->b_cmd = bp->b_cmd; 1070 biox->bio_done = swap_chain_iodone; 1071 biox->bio_offset = (off_t)blk << PAGE_SHIFT; 1072 biox->bio_caller_info1.cluster_parent = nbio; 1073 biox_blkno = blk; 1074 bufx->b_bcount = 0; 1075 bufx->b_data = data; 1076 } 1077 bufx->b_bcount += PAGE_SIZE; 1078 } 1079 --count; 1080 ++start; 1081 data += PAGE_SIZE; 1082 } 1083 crit_exit(); 1084 1085 /* 1086 * Flush out last buffer 1087 */ 1088 if (biox) { 1089 if (bufx->b_cmd == BUF_CMD_READ) { 1090 ++mycpu->gd_cnt.v_swapin; 1091 mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount); 1092 } else { 1093 ++mycpu->gd_cnt.v_swapout; 1094 mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount); 1095 bufx->b_dirtyend = bufx->b_bcount; 1096 } 1097 KKASSERT(bufx->b_bcount); 1098 if (bufx->b_cmd != BUF_CMD_READ) 1099 bufx->b_dirtyend = bufx->b_bcount; 1100 /* biox, bufx = NULL */ 1101 } 1102 1103 /* 1104 * Now initiate all the I/O. Be careful looping on our chain as 1105 * I/O's may complete while we are still initiating them. 1106 * 1107 * If the request is a 100% sparse read no bios will be present 1108 * and we just biodone() the buffer. 1109 */ 1110 nbio->bio_caller_info2.cluster_tail = NULL; 1111 bufx = nbio->bio_caller_info1.cluster_head; 1112 1113 if (bufx) { 1114 while (bufx) { 1115 biox = &bufx->b_bio1; 1116 BUF_KERNPROC(bufx); 1117 bufx = bufx->b_cluster_next; 1118 vn_strategy(swapdev_vp, biox); 1119 } 1120 } else { 1121 biodone(bio); 1122 } 1123 1124 /* 1125 * Completion of the cluster will also call biodone_chain(nbio). 1126 * We never call biodone(nbio) so we don't have to worry about 1127 * setting up a bio_done callback. It's handled in the sub-IO. 1128 */ 1129 /**/ 1130 } 1131 1132 static void 1133 swap_chain_iodone(struct bio *biox) 1134 { 1135 struct buf **nextp; 1136 struct buf *bufx; /* chained sub-buffer */ 1137 struct bio *nbio; /* parent nbio with chain glue */ 1138 struct buf *bp; /* original bp associated with nbio */ 1139 int chain_empty; 1140 1141 bufx = biox->bio_buf; 1142 nbio = biox->bio_caller_info1.cluster_parent; 1143 bp = nbio->bio_buf; 1144 1145 /* 1146 * Update the original buffer 1147 */ 1148 KKASSERT(bp != NULL); 1149 if (bufx->b_flags & B_ERROR) { 1150 atomic_set_int(&bufx->b_flags, B_ERROR); 1151 bp->b_error = bufx->b_error; 1152 } else if (bufx->b_resid != 0) { 1153 atomic_set_int(&bufx->b_flags, B_ERROR); 1154 bp->b_error = EINVAL; 1155 } else { 1156 atomic_subtract_int(&bp->b_resid, bufx->b_bcount); 1157 } 1158 1159 /* 1160 * Remove us from the chain. 1161 */ 1162 spin_lock_wr(&bp->b_lock.lk_spinlock); 1163 nextp = &nbio->bio_caller_info1.cluster_head; 1164 while (*nextp != bufx) { 1165 KKASSERT(*nextp != NULL); 1166 nextp = &(*nextp)->b_cluster_next; 1167 } 1168 *nextp = bufx->b_cluster_next; 1169 chain_empty = (nbio->bio_caller_info1.cluster_head == NULL); 1170 spin_unlock_wr(&bp->b_lock.lk_spinlock); 1171 1172 /* 1173 * Clean up bufx. If the chain is now empty we finish out 1174 * the parent. Note that we may be racing other completions 1175 * so we must use the chain_empty status from above. 
1176 */ 1177 if (chain_empty) { 1178 if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) { 1179 atomic_set_int(&bp->b_flags, B_ERROR); 1180 bp->b_error = EINVAL; 1181 } 1182 biodone_chain(nbio); 1183 } 1184 relpbuf(bufx, NULL); 1185 } 1186 1187 /* 1188 * SWAP_PAGER_GETPAGES() - bring page in from swap 1189 * 1190 * The requested page may have to be brought in from swap. Calculate the 1191 * swap block and bring in additional pages if possible. All pages must 1192 * have contiguous swap block assignments and reside in the same object. 1193 * 1194 * The caller has a single vm_object_pip_add() reference prior to 1195 * calling us and we should return with the same. 1196 * 1197 * The caller has BUSY'd the page. We should return with (*mpp) left busy, 1198 * and any additinal pages unbusied. 1199 * 1200 * If the caller encounters a PG_RAM page it will pass it to us even though 1201 * it may be valid and dirty. We cannot overwrite the page in this case! 1202 * The case is used to allow us to issue pure read-aheads. 1203 * 1204 * NOTE! XXX This code does not entirely pipeline yet due to the fact that 1205 * the PG_RAM page is validated at the same time as mreq. What we 1206 * really need to do is issue a separate read-ahead pbuf. 1207 */ 1208 static int 1209 swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess) 1210 { 1211 struct buf *bp; 1212 struct bio *bio; 1213 vm_page_t mreq; 1214 vm_page_t m; 1215 vm_offset_t kva; 1216 daddr_t blk; 1217 int i; 1218 int j; 1219 int raonly; 1220 vm_page_t marray[XIO_INTERNAL_PAGES]; 1221 1222 mreq = *mpp; 1223 1224 if (mreq->object != object) { 1225 panic("swap_pager_getpages: object mismatch %p/%p", 1226 object, 1227 mreq->object 1228 ); 1229 } 1230 1231 /* 1232 * We don't want to overwrite a fully valid page as it might be 1233 * dirty. This case can occur when e.g. vm_fault hits a perfectly 1234 * valid page with PG_RAM set. 1235 * 1236 * In this case we see if the next page is a suitable page-in 1237 * candidate and if it is we issue read-ahead. PG_RAM will be 1238 * set on the last page of the read-ahead to continue the pipeline. 1239 */ 1240 if (mreq->valid == VM_PAGE_BITS_ALL) { 1241 if (swap_burst_read == 0 || mreq->pindex + 1 >= object->size) 1242 return(VM_PAGER_OK); 1243 crit_enter(); 1244 blk = swp_pager_meta_ctl(object, mreq->pindex + 1, 0); 1245 if (blk == SWAPBLK_NONE) { 1246 crit_exit(); 1247 return(VM_PAGER_OK); 1248 } 1249 m = vm_page_lookup(object, mreq->pindex + 1); 1250 if (m == NULL) { 1251 m = vm_page_alloc(object, mreq->pindex + 1, 1252 VM_ALLOC_QUICK); 1253 if (m == NULL) { 1254 crit_exit(); 1255 return(VM_PAGER_OK); 1256 } 1257 } else { 1258 if ((m->flags & PG_BUSY) || m->busy || m->valid) { 1259 crit_exit(); 1260 return(VM_PAGER_OK); 1261 } 1262 vm_page_unqueue_nowakeup(m); 1263 vm_page_busy(m); 1264 } 1265 mreq = m; 1266 raonly = 1; 1267 crit_exit(); 1268 } else { 1269 raonly = 0; 1270 } 1271 1272 /* 1273 * Try to block-read contiguous pages from swap if sequential, 1274 * otherwise just read one page. Contiguous pages from swap must 1275 * reside within a single device stripe because the I/O cannot be 1276 * broken up across multiple stripes. 1277 * 1278 * Note that blk and iblk can be SWAPBLK_NONE but the loop is 1279 * set up such that the case(s) are handled implicitly. 
1280 */ 1281 crit_enter(); 1282 blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0); 1283 marray[0] = mreq; 1284 1285 for (i = 1; swap_burst_read && 1286 i < XIO_INTERNAL_PAGES && 1287 mreq->pindex + i < object->size; ++i) { 1288 daddr_t iblk; 1289 1290 iblk = swp_pager_meta_ctl(object, mreq->pindex + i, 0); 1291 if (iblk != blk + i) 1292 break; 1293 if ((blk ^ iblk) & dmmax_mask) 1294 break; 1295 m = vm_page_lookup(object, mreq->pindex + i); 1296 if (m == NULL) { 1297 m = vm_page_alloc(object, mreq->pindex + i, 1298 VM_ALLOC_QUICK); 1299 if (m == NULL) 1300 break; 1301 } else { 1302 if ((m->flags & PG_BUSY) || m->busy || m->valid) 1303 break; 1304 vm_page_unqueue_nowakeup(m); 1305 vm_page_busy(m); 1306 } 1307 marray[i] = m; 1308 } 1309 if (i > 1) 1310 vm_page_flag_set(marray[i - 1], PG_RAM); 1311 1312 crit_exit(); 1313 1314 /* 1315 * If mreq is the requested page and we have nothing to do return 1316 * VM_PAGER_FAIL. If raonly is set mreq is just another read-ahead 1317 * page and must be cleaned up. 1318 */ 1319 if (blk == SWAPBLK_NONE) { 1320 KKASSERT(i == 1); 1321 if (raonly) { 1322 vnode_pager_freepage(mreq); 1323 return(VM_PAGER_OK); 1324 } else { 1325 return(VM_PAGER_FAIL); 1326 } 1327 } 1328 1329 /* 1330 * map our page(s) into kva for input 1331 */ 1332 bp = getpbuf(&nsw_rcount); 1333 bio = &bp->b_bio1; 1334 kva = (vm_offset_t) bp->b_kvabase; 1335 bcopy(marray, bp->b_xio.xio_pages, i * sizeof(vm_page_t)); 1336 pmap_qenter(kva, bp->b_xio.xio_pages, i); 1337 1338 bp->b_data = (caddr_t)kva; 1339 bp->b_bcount = PAGE_SIZE * i; 1340 bp->b_xio.xio_npages = i; 1341 bio->bio_done = swp_pager_async_iodone; 1342 bio->bio_offset = (off_t)blk << PAGE_SHIFT; 1343 bio->bio_caller_info1.index = SWBIO_READ; 1344 1345 /* 1346 * Set index. If raonly set the index beyond the array so all 1347 * the pages are treated the same, otherwise the original mreq is 1348 * at index 0. 1349 */ 1350 if (raonly) 1351 bio->bio_driver_info = (void *)(intptr_t)i; 1352 else 1353 bio->bio_driver_info = (void *)(intptr_t)0; 1354 1355 for (j = 0; j < i; ++j) 1356 vm_page_flag_set(bp->b_xio.xio_pages[j], PG_SWAPINPROG); 1357 1358 mycpu->gd_cnt.v_swapin++; 1359 mycpu->gd_cnt.v_swappgsin += bp->b_xio.xio_npages; 1360 1361 /* 1362 * We still hold the lock on mreq, and our automatic completion routine 1363 * does not remove it. 1364 */ 1365 vm_object_pip_add(object, bp->b_xio.xio_npages); 1366 1367 /* 1368 * perform the I/O. NOTE!!! bp cannot be considered valid after 1369 * this point because we automatically release it on completion. 1370 * Instead, we look at the one page we are interested in which we 1371 * still hold a lock on even through the I/O completion. 1372 * 1373 * The other pages in our m[] array are also released on completion, 1374 * so we cannot assume they are valid anymore either. 1375 */ 1376 bp->b_cmd = BUF_CMD_READ; 1377 BUF_KERNPROC(bp); 1378 vn_strategy(swapdev_vp, bio); 1379 1380 /* 1381 * Wait for the page we want to complete. PG_SWAPINPROG is always 1382 * cleared on completion. If an I/O error occurs, SWAPBLK_NONE 1383 * is set in the meta-data. 1384 * 1385 * If this is a read-ahead only we return immediately without 1386 * waiting for I/O. 1387 */ 1388 if (raonly) 1389 return(VM_PAGER_OK); 1390 1391 /* 1392 * Read-ahead includes originally requested page case. 
1393 */ 1394 crit_enter(); 1395 while ((mreq->flags & PG_SWAPINPROG) != 0) { 1396 vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED); 1397 mycpu->gd_cnt.v_intrans++; 1398 if (tsleep(mreq, 0, "swread", hz*20)) { 1399 kprintf( 1400 "swap_pager: indefinite wait buffer: " 1401 " offset: %lld, size: %ld\n", 1402 (long long)bio->bio_offset, 1403 (long)bp->b_bcount 1404 ); 1405 } 1406 } 1407 crit_exit(); 1408 1409 /* 1410 * mreq is left bussied after completion, but all the other pages 1411 * are freed. If we had an unrecoverable read error the page will 1412 * not be valid. 1413 */ 1414 if (mreq->valid != VM_PAGE_BITS_ALL) 1415 return(VM_PAGER_ERROR); 1416 else 1417 return(VM_PAGER_OK); 1418 1419 /* 1420 * A final note: in a low swap situation, we cannot deallocate swap 1421 * and mark a page dirty here because the caller is likely to mark 1422 * the page clean when we return, causing the page to possibly revert 1423 * to all-zero's later. 1424 */ 1425 } 1426 1427 /* 1428 * swap_pager_putpages: 1429 * 1430 * Assign swap (if necessary) and initiate I/O on the specified pages. 1431 * 1432 * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects 1433 * are automatically converted to SWAP objects. 1434 * 1435 * In a low memory situation we may block in vn_strategy(), but the new 1436 * vm_page reservation system coupled with properly written VFS devices 1437 * should ensure that no low-memory deadlock occurs. This is an area 1438 * which needs work. 1439 * 1440 * The parent has N vm_object_pip_add() references prior to 1441 * calling us and will remove references for rtvals[] that are 1442 * not set to VM_PAGER_PEND. We need to remove the rest on I/O 1443 * completion. 1444 * 1445 * The parent has soft-busy'd the pages it passes us and will unbusy 1446 * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. 1447 * We need to unbusy the rest on I/O completion. 1448 */ 1449 void 1450 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, 1451 boolean_t sync, int *rtvals) 1452 { 1453 int i; 1454 int n = 0; 1455 1456 if (count && m[0]->object != object) { 1457 panic("swap_pager_getpages: object mismatch %p/%p", 1458 object, 1459 m[0]->object 1460 ); 1461 } 1462 1463 /* 1464 * Step 1 1465 * 1466 * Turn object into OBJT_SWAP 1467 * check for bogus sysops 1468 * force sync if not pageout process 1469 */ 1470 if (object->type == OBJT_DEFAULT) 1471 swp_pager_meta_convert(object); 1472 1473 if (curthread != pagethread) 1474 sync = TRUE; 1475 1476 /* 1477 * Step 2 1478 * 1479 * Update nsw parameters from swap_async_max sysctl values. 1480 * Do not let the sysop crash the machine with bogus numbers. 1481 */ 1482 1483 if (swap_async_max != nsw_wcount_async_max) { 1484 int n; 1485 1486 /* 1487 * limit range 1488 */ 1489 if ((n = swap_async_max) > nswbuf / 2) 1490 n = nswbuf / 2; 1491 if (n < 1) 1492 n = 1; 1493 swap_async_max = n; 1494 1495 /* 1496 * Adjust difference ( if possible ). If the current async 1497 * count is too low, we may not be able to make the adjustment 1498 * at this time. 1499 */ 1500 crit_enter(); 1501 n -= nsw_wcount_async_max; 1502 if (nsw_wcount_async + n >= 0) { 1503 nsw_wcount_async += n; 1504 nsw_wcount_async_max += n; 1505 wakeup(&nsw_wcount_async); 1506 } 1507 crit_exit(); 1508 } 1509 1510 /* 1511 * Step 3 1512 * 1513 * Assign swap blocks and issue I/O. We reallocate swap on the fly. 1514 * The page is left dirty until the pageout operation completes 1515 * successfully. 
1516 */ 1517 1518 for (i = 0; i < count; i += n) { 1519 struct buf *bp; 1520 struct bio *bio; 1521 daddr_t blk; 1522 int j; 1523 1524 /* 1525 * Maximum I/O size is limited by a number of factors. 1526 */ 1527 1528 n = min(BLIST_MAX_ALLOC, count - i); 1529 n = min(n, nsw_cluster_max); 1530 1531 crit_enter(); 1532 1533 /* 1534 * Get biggest block of swap we can. If we fail, fall 1535 * back and try to allocate a smaller block. Don't go 1536 * overboard trying to allocate space if it would overly 1537 * fragment swap. 1538 */ 1539 while ( 1540 (blk = swp_pager_getswapspace(object, n)) == SWAPBLK_NONE && 1541 n > 4 1542 ) { 1543 n >>= 1; 1544 } 1545 if (blk == SWAPBLK_NONE) { 1546 for (j = 0; j < n; ++j) 1547 rtvals[i+j] = VM_PAGER_FAIL; 1548 crit_exit(); 1549 continue; 1550 } 1551 1552 /* 1553 * The I/O we are constructing cannot cross a physical 1554 * disk boundry in the swap stripe. Note: we are still 1555 * at splvm(). 1556 */ 1557 if ((blk ^ (blk + n)) & dmmax_mask) { 1558 j = ((blk + dmmax) & dmmax_mask) - blk; 1559 swp_pager_freeswapspace(object, blk + j, n - j); 1560 n = j; 1561 } 1562 1563 /* 1564 * All I/O parameters have been satisfied, build the I/O 1565 * request and assign the swap space. 1566 */ 1567 if (sync == TRUE) 1568 bp = getpbuf(&nsw_wcount_sync); 1569 else 1570 bp = getpbuf(&nsw_wcount_async); 1571 bio = &bp->b_bio1; 1572 1573 pmap_qenter((vm_offset_t)bp->b_data, &m[i], n); 1574 1575 bp->b_bcount = PAGE_SIZE * n; 1576 bio->bio_offset = (off_t)blk << PAGE_SHIFT; 1577 1578 for (j = 0; j < n; ++j) { 1579 vm_page_t mreq = m[i+j]; 1580 1581 swp_pager_meta_build(mreq->object, mreq->pindex, 1582 blk + j); 1583 if (object->type == OBJT_SWAP) 1584 vm_page_dirty(mreq); 1585 rtvals[i+j] = VM_PAGER_OK; 1586 1587 vm_page_flag_set(mreq, PG_SWAPINPROG); 1588 bp->b_xio.xio_pages[j] = mreq; 1589 } 1590 bp->b_xio.xio_npages = n; 1591 1592 mycpu->gd_cnt.v_swapout++; 1593 mycpu->gd_cnt.v_swappgsout += bp->b_xio.xio_npages; 1594 1595 crit_exit(); 1596 1597 bp->b_dirtyoff = 0; /* req'd for NFS */ 1598 bp->b_dirtyend = bp->b_bcount; /* req'd for NFS */ 1599 bp->b_cmd = BUF_CMD_WRITE; 1600 bio->bio_caller_info1.index = SWBIO_WRITE; 1601 1602 /* 1603 * asynchronous 1604 */ 1605 if (sync == FALSE) { 1606 bio->bio_done = swp_pager_async_iodone; 1607 BUF_KERNPROC(bp); 1608 vn_strategy(swapdev_vp, bio); 1609 1610 for (j = 0; j < n; ++j) 1611 rtvals[i+j] = VM_PAGER_PEND; 1612 continue; 1613 } 1614 1615 /* 1616 * Issue synchrnously. 1617 * 1618 * Wait for the sync I/O to complete, then update rtvals. 1619 * We just set the rtvals[] to VM_PAGER_PEND so we can call 1620 * our async completion routine at the end, thus avoiding a 1621 * double-free. 1622 */ 1623 bio->bio_caller_info1.index |= SWBIO_SYNC; 1624 bio->bio_done = biodone_sync; 1625 bio->bio_flags |= BIO_SYNC; 1626 vn_strategy(swapdev_vp, bio); 1627 biowait(bio, "swwrt"); 1628 1629 for (j = 0; j < n; ++j) 1630 rtvals[i+j] = VM_PAGER_PEND; 1631 1632 /* 1633 * Now that we are through with the bp, we can call the 1634 * normal async completion, which frees everything up. 1635 */ 1636 swp_pager_async_iodone(bio); 1637 } 1638 } 1639 1640 void 1641 swap_pager_newswap(void) 1642 { 1643 swp_sizecheck(); 1644 } 1645 1646 /* 1647 * swp_pager_async_iodone: 1648 * 1649 * Completion routine for asynchronous reads and writes from/to swap. 1650 * Also called manually by synchronous code to finish up a bp. 1651 * 1652 * For READ operations, the pages are PG_BUSY'd. For WRITE operations, 1653 * the pages are vm_page_t->busy'd. 
For READ operations, we PG_BUSY 1654 * unbusy all pages except the 'main' request page. For WRITE 1655 * operations, we vm_page_t->busy'd unbusy all pages ( we can do this 1656 * because we marked them all VM_PAGER_PEND on return from putpages ). 1657 * 1658 * This routine may not block. 1659 */ 1660 static void 1661 swp_pager_async_iodone(struct bio *bio) 1662 { 1663 struct buf *bp = bio->bio_buf; 1664 vm_object_t object = NULL; 1665 int i; 1666 int *nswptr; 1667 1668 /* 1669 * report error 1670 */ 1671 if (bp->b_flags & B_ERROR) { 1672 kprintf( 1673 "swap_pager: I/O error - %s failed; offset %lld," 1674 "size %ld, error %d\n", 1675 ((bio->bio_caller_info1.index & SWBIO_READ) ? 1676 "pagein" : "pageout"), 1677 (long long)bio->bio_offset, 1678 (long)bp->b_bcount, 1679 bp->b_error 1680 ); 1681 } 1682 1683 /* 1684 * set object, raise to splvm(). 1685 */ 1686 if (bp->b_xio.xio_npages) 1687 object = bp->b_xio.xio_pages[0]->object; 1688 crit_enter(); 1689 1690 /* 1691 * remove the mapping for kernel virtual 1692 */ 1693 pmap_qremove((vm_offset_t)bp->b_data, bp->b_xio.xio_npages); 1694 1695 /* 1696 * cleanup pages. If an error occurs writing to swap, we are in 1697 * very serious trouble. If it happens to be a disk error, though, 1698 * we may be able to recover by reassigning the swap later on. So 1699 * in this case we remove the m->swapblk assignment for the page 1700 * but do not free it in the rlist. The errornous block(s) are thus 1701 * never reallocated as swap. Redirty the page and continue. 1702 */ 1703 for (i = 0; i < bp->b_xio.xio_npages; ++i) { 1704 vm_page_t m = bp->b_xio.xio_pages[i]; 1705 1706 if (bp->b_flags & B_ERROR) { 1707 /* 1708 * If an error occurs I'd love to throw the swapblk 1709 * away without freeing it back to swapspace, so it 1710 * can never be used again. But I can't from an 1711 * interrupt. 1712 */ 1713 1714 if (bio->bio_caller_info1.index & SWBIO_READ) { 1715 /* 1716 * When reading, reqpage needs to stay 1717 * locked for the parent, but all other 1718 * pages can be freed. We still want to 1719 * wakeup the parent waiting on the page, 1720 * though. ( also: pg_reqpage can be -1 and 1721 * not match anything ). 1722 * 1723 * We have to wake specifically requested pages 1724 * up too because we cleared PG_SWAPINPROG and 1725 * someone may be waiting for that. 1726 * 1727 * NOTE: for reads, m->dirty will probably 1728 * be overridden by the original caller of 1729 * getpages so don't play cute tricks here. 1730 * 1731 * NOTE: We can't actually free the page from 1732 * here, because this is an interrupt. It 1733 * is not legal to mess with object->memq 1734 * from an interrupt. Deactivate the page 1735 * instead. 1736 */ 1737 1738 m->valid = 0; 1739 vm_page_flag_clear(m, PG_ZERO); 1740 vm_page_flag_clear(m, PG_SWAPINPROG); 1741 1742 /* 1743 * bio_driver_info holds the requested page 1744 * index. 1745 */ 1746 if (i != (int)(intptr_t)bio->bio_driver_info) { 1747 vm_page_deactivate(m); 1748 vm_page_wakeup(m); 1749 } else { 1750 vm_page_flash(m); 1751 } 1752 /* 1753 * If i == bp->b_pager.pg_reqpage, do not wake 1754 * the page up. The caller needs to. 1755 */ 1756 } else { 1757 /* 1758 * If a write error occurs remove the swap 1759 * assignment (note that PG_SWAPPED may or 1760 * may not be set depending on prior activity). 1761 * 1762 * Re-dirty OBJT_SWAP pages as there is no 1763 * other backing store, we can't throw the 1764 * page away. 
1765 * 1766 * Non-OBJT_SWAP pages (aka swapcache) must 1767 * not be dirtied since they may not have 1768 * been dirty in the first place, and they 1769 * do have backing store (the vnode). 1770 */ 1771 swp_pager_meta_ctl(m->object, m->pindex, 1772 SWM_FREE); 1773 vm_page_flag_clear(m, PG_SWAPPED); 1774 if (m->object->type == OBJT_SWAP) { 1775 vm_page_dirty(m); 1776 vm_page_activate(m); 1777 } 1778 vm_page_flag_clear(m, PG_SWAPINPROG); 1779 vm_page_io_finish(m); 1780 } 1781 } else if (bio->bio_caller_info1.index & SWBIO_READ) { 1782 /* 1783 * NOTE: for reads, m->dirty will probably be 1784 * overridden by the original caller of getpages so 1785 * we cannot set them in order to free the underlying 1786 * swap in a low-swap situation. I don't think we'd 1787 * want to do that anyway, but it was an optimization 1788 * that existed in the old swapper for a time before 1789 * it got ripped out due to precisely this problem. 1790 * 1791 * clear PG_ZERO in page. 1792 * 1793 * If not the requested page then deactivate it. 1794 * 1795 * Note that the requested page, reqpage, is left 1796 * busied, but we still have to wake it up. The 1797 * other pages are released (unbusied) by 1798 * vm_page_wakeup(). We do not set reqpage's 1799 * valid bits here, it is up to the caller. 1800 */ 1801 1802 /* 1803 * NOTE: can't call pmap_clear_modify(m) from an 1804 * interrupt thread, the pmap code may have to map 1805 * non-kernel pmaps and currently asserts the case. 1806 */ 1807 /*pmap_clear_modify(m);*/ 1808 m->valid = VM_PAGE_BITS_ALL; 1809 vm_page_undirty(m); 1810 vm_page_flag_clear(m, PG_ZERO | PG_SWAPINPROG); 1811 vm_page_flag_set(m, PG_SWAPPED); 1812 1813 /* 1814 * We have to wake specifically requested pages 1815 * up too because we cleared PG_SWAPINPROG and 1816 * could be waiting for it in getpages. However, 1817 * be sure to not unbusy getpages specifically 1818 * requested page - getpages expects it to be 1819 * left busy. 1820 * 1821 * bio_driver_info holds the requested page 1822 */ 1823 if (i != (int)(intptr_t)bio->bio_driver_info) { 1824 vm_page_deactivate(m); 1825 vm_page_wakeup(m); 1826 } else { 1827 vm_page_flash(m); 1828 } 1829 } else { 1830 /* 1831 * Mark the page clean but do not mess with the 1832 * pmap-layer's modified state. That state should 1833 * also be clear since the caller protected the 1834 * page VM_PROT_READ, but allow the case. 1835 * 1836 * We are in an interrupt, avoid pmap operations. 1837 * 1838 * If we have a severe page deficit, deactivate the 1839 * page. Do not try to cache it (which would also 1840 * involve a pmap op), because the page might still 1841 * be read-heavy. 1842 * 1843 * When using the swap to cache clean vnode pages 1844 * we do not mess with the page dirty bits. 1845 */ 1846 if (m->object->type == OBJT_SWAP) 1847 vm_page_undirty(m); 1848 vm_page_flag_clear(m, PG_SWAPINPROG); 1849 vm_page_flag_set(m, PG_SWAPPED); 1850 vm_page_io_finish(m); 1851 if (vm_page_count_severe()) 1852 vm_page_deactivate(m); 1853 #if 0 1854 if (!vm_page_count_severe() || !vm_page_try_to_cache(m)) 1855 vm_page_protect(m, VM_PROT_READ); 1856 #endif 1857 } 1858 } 1859 1860 /* 1861 * adjust pip. NOTE: the original parent may still have its own 1862 * pip refs on the object. 1863 */ 1864 1865 if (object) 1866 vm_object_pip_wakeupn(object, bp->b_xio.xio_npages); 1867 1868 /* 1869 * Release the physical I/O buffer. 1870 * 1871 * NOTE: Due to synchronous operations in the write case b_cmd may 1872 * already be set to BUF_CMD_DONE and BIO_SYNC may have already 1873 * been cleared. 
	 */
	if (bio->bio_caller_info1.index & SWBIO_READ)
		nswptr = &nsw_rcount;
	else if (bio->bio_caller_info1.index & SWBIO_SYNC)
		nswptr = &nsw_wcount_sync;
	else
		nswptr = &nsw_wcount_async;
	bp->b_cmd = BUF_CMD_DONE;
	relpbuf(bp, nswptr);
	crit_exit();
}

/************************************************************************
 *				SWAP META DATA				*
 ************************************************************************
 *
 *	These routines manipulate the swap metadata stored in the
 *	OBJT_SWAP object.  All swp_*() routines must be called at
 *	splvm() because swap can be freed up by the low level vm_page
 *	code which might be called from interrupts beyond what splbio()
 *	covers.
 *
 *	Swap metadata is implemented as per-object swblock structures
 *	organized in a red-black tree (object->swblock_root); the object
 *	also maintains a count of swblocks (object->swblock_count).
 */

/*
 * Lookup the swblock containing the specified swap block index.
 */
static __inline
struct swblock *
swp_pager_lookup(vm_object_t object, vm_pindex_t index)
{
	index &= ~SWAP_META_MASK;
	return (RB_LOOKUP(swblock_rb_tree, &object->swblock_root, index));
}

/*
 * Remove a swblock from the RB tree.
 */
static __inline
void
swp_pager_remove(vm_object_t object, struct swblock *swap)
{
	RB_REMOVE(swblock_rb_tree, &object->swblock_root, swap);
}

/*
 * Convert a default object to a swap object if necessary.
 */
static void
swp_pager_meta_convert(vm_object_t object)
{
	if (object->type == OBJT_DEFAULT) {
		object->type = OBJT_SWAP;
		KKASSERT(object->swblock_count == 0);
	}
}
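
/*
 * Illustrative sketch (not compiled): how a page index maps onto the
 * metadata above.  Each swblock covers SWAP_META_PAGES consecutive page
 * indices; the RB tree is keyed on the base index (low SWAP_META_MASK bits
 * cleared) and the low bits select a slot in swb_pages[].  This assumes
 * SWAP_META_MASK == SWAP_META_PAGES - 1, which the masking in
 * swp_pager_meta_build() and swp_pager_meta_ctl() implies.  The function
 * name below is hypothetical and exists only for illustration.
 */
#if 0
static daddr_t
swp_pager_meta_peek(vm_object_t object, vm_pindex_t pindex)
{
	struct swblock *swap;

	/*
	 * swp_pager_lookup() masks pindex down to the swblock base index
	 * before searching the per-object RB tree.
	 */
	swap = swp_pager_lookup(object, pindex);
	if (swap == NULL)
		return (SWAPBLK_NONE);

	/*
	 * The low bits select the slot; SWAPBLK_NONE means no swap block
	 * is assigned to that page index.
	 */
	return (swap->swb_pages[pindex & SWAP_META_MASK]);
}
#endif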
/*
 * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object
 *
 *	We first convert the object to a swap object if it is a default
 *	object.  Vnode objects do not need to be converted.
 *
 *	The specified swapblk is added to the object's swap metadata.  If
 *	the swapblk is not valid, it is freed instead.  Any previously
 *	assigned swapblk is freed.
 */
static void
swp_pager_meta_build(vm_object_t object, vm_pindex_t index, daddr_t swapblk)
{
	struct swblock *swap;
	struct swblock *oswap;

	KKASSERT(swapblk != SWAPBLK_NONE);

	/*
	 * Convert object if necessary
	 */
	if (object->type == OBJT_DEFAULT)
		swp_pager_meta_convert(object);

	/*
	 * Locate the swblock, creating it if it does not yet exist.  If
	 * the zone allocation fails we wait for memory and retry, since
	 * the tree may have changed while we slept.
	 */
retry:
	swap = swp_pager_lookup(object, index);

	if (swap == NULL) {
		int i;

		swap = zalloc(swap_zone);
		if (swap == NULL) {
			vm_wait(0);
			goto retry;
		}
		swap->swb_index = index & ~SWAP_META_MASK;
		swap->swb_count = 0;

		++object->swblock_count;

		for (i = 0; i < SWAP_META_PAGES; ++i)
			swap->swb_pages[i] = SWAPBLK_NONE;
		oswap = RB_INSERT(swblock_rb_tree, &object->swblock_root,
				  swap);
		KKASSERT(oswap == NULL);
	}

	/*
	 * Delete prior contents of metadata
	 */
	index &= SWAP_META_MASK;

	if (swap->swb_pages[index] != SWAPBLK_NONE) {
		swp_pager_freeswapspace(object, swap->swb_pages[index], 1);
		--swap->swb_count;
	}

	/*
	 * Enter block into metadata
	 */
	swap->swb_pages[index] = swapblk;
	if (swapblk != SWAPBLK_NONE)
		++swap->swb_count;
}

/*
 * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
 *
 *	The requested range of blocks is freed, with any associated swap
 *	returned to the swap bitmap.
 *
 *	This routine will free swap metadata structures as they are cleaned
 *	out.  This routine does *NOT* operate on swap metadata associated
 *	with resident pages.
 *
 *	This routine must be called at splvm()
 */
static int swp_pager_meta_free_callback(struct swblock *swb, void *data);

static void
swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count)
{
	struct swfreeinfo info;

	/*
	 * Nothing to do
	 */
	if (object->swblock_count == 0) {
		KKASSERT(RB_EMPTY(&object->swblock_root));
		return;
	}
	if (count == 0)
		return;

	/*
	 * Setup for RB tree scan.  Note that the pindex range can be huge
	 * due to the 64 bit page index space so we cannot safely iterate.
	 */
	info.object = object;
	info.basei = index & ~SWAP_META_MASK;
	info.begi = index;
	info.endi = index + count - 1;
	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_scancmp,
				swp_pager_meta_free_callback, &info);
}

static
int
swp_pager_meta_free_callback(struct swblock *swap, void *data)
{
	struct swfreeinfo *info = data;
	vm_object_t object = info->object;
	int index;
	int eindex;

	/*
	 * Figure out the range within the swblock.  The wider scan may
	 * return edge-case swap blocks when the start and/or end points
	 * are in the middle of a block.
	 */
	if (swap->swb_index < info->begi)
		index = (int)info->begi & SWAP_META_MASK;
	else
		index = 0;

	if (swap->swb_index + SWAP_META_PAGES > info->endi)
		eindex = (int)info->endi & SWAP_META_MASK;
	else
		eindex = SWAP_META_MASK;

	/*
	 * Scan and free the blocks.  The loop terminates early if (swap)
	 * runs out of assigned blocks, in which case the swblock itself
	 * has been freed.
	 */
	while (index <= eindex) {
		daddr_t v = swap->swb_pages[index];

		if (v != SWAPBLK_NONE) {
			swp_pager_freeswapspace(object, v, 1);
			swap->swb_pages[index] = SWAPBLK_NONE;
			if (--swap->swb_count == 0) {
				swp_pager_remove(object, swap);
				zfree(swap_zone, swap);
				--object->swblock_count;
				break;
			}
		}
		++index;
	}
	/* swap may be invalid here due to the zfree() above */
	return(0);
}

/*
 * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
 *
 *	This routine locates and destroys all swap metadata associated with
 *	an object.
 *
 *	This routine must be called at splvm()
 */
static void
swp_pager_meta_free_all(vm_object_t object)
{
	struct swblock *swap;
	int i;

	while ((swap = RB_ROOT(&object->swblock_root)) != NULL) {
		swp_pager_remove(object, swap);
		for (i = 0; i < SWAP_META_PAGES; ++i) {
			daddr_t v = swap->swb_pages[i];
			if (v != SWAPBLK_NONE) {
				--swap->swb_count;
				swp_pager_freeswapspace(object, v, 1);
			}
		}
		if (swap->swb_count != 0)
			panic("swap_pager_meta_free_all: swb_count != 0");
		zfree(swap_zone, swap);
		--object->swblock_count;
	}
	KKASSERT(object->swblock_count == 0);
}

/*
 * SWP_PAGER_META_CTL() - misc control of swap and vm_page_t meta data.
 *
 *	This routine is capable of looking up, popping, or freeing
 *	swapblk assignments in the swap meta data or in the vm_page_t.
 *	The routine typically returns the swapblk being looked-up, or popped,
 *	or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
 *	was invalid.  This routine will automatically free any invalid
 *	meta-data swapblks.
 *
 *	It is not possible to store invalid swapblks in the swap meta data
 *	(other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
 *
 *	When acting on a busy resident page and paging is in progress, we
 *	have to wait until paging is complete but otherwise can act on the
 *	busy page.
 *
 *	This routine must be called at splvm().
 *
 *	SWM_FREE	remove and free swap block from metadata
 *	SWM_POP		remove from meta data but do not free it... pop it out
 */
static daddr_t
swp_pager_meta_ctl(vm_object_t object, vm_pindex_t index, int flags)
{
	struct swblock *swap;
	daddr_t r1;

	if (object->swblock_count == 0)
		return(SWAPBLK_NONE);

	r1 = SWAPBLK_NONE;
	swap = swp_pager_lookup(object, index);

	if (swap != NULL) {
		index &= SWAP_META_MASK;
		r1 = swap->swb_pages[index];

		if (r1 != SWAPBLK_NONE) {
			if (flags & SWM_FREE) {
				swp_pager_freeswapspace(object, r1, 1);
				r1 = SWAPBLK_NONE;
			}
			if (flags & (SWM_FREE|SWM_POP)) {
				swap->swb_pages[index] = SWAPBLK_NONE;
				if (--swap->swb_count == 0) {
					swp_pager_remove(object, swap);
					zfree(swap_zone, swap);
					--object->swblock_count;
				}
			}
		}
	}
	return(r1);
}
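
/*
 * Illustrative sketch (not compiled): typical use of swp_pager_meta_ctl().
 * SWM_POP removes the swapblk assignment from the metadata and hands it to
 * the caller without freeing it; SWM_FREE also returns the swap to the
 * bitmap and yields SWAPBLK_NONE.  The function and variable names below
 * are hypothetical and exist only for illustration; the caller is assumed
 * to hold the usual splvm()-level protection (crit_enter() in this file).
 */
#if 0
static void
swp_pager_meta_pop_example(vm_object_t object, vm_pindex_t pindex)
{
	daddr_t blk;

	crit_enter();
	blk = swp_pager_meta_ctl(object, pindex, SWM_POP);
	crit_exit();

	if (blk != SWAPBLK_NONE) {
		/*
		 * The block is no longer tracked by the object's metadata;
		 * the caller is now responsible for reusing or freeing it.
		 */
	}
}
#endif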