/*	$OpenBSD: uvm_swap.c,v 1.128 2014/07/12 18:44:01 tedu Exp $	*/
/*	$NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/disk.h>
#include <sys/task.h>
#if defined(NFSCLIENT)
#include <sys/socket.h>
#include <sys/domain.h>
#include <netinet/in.h>
#include <nfs/nfsproto.h>
#include <nfs/nfsdiskless.h>
#endif

#include <uvm/uvm.h>
#ifdef UVM_SWAP_ENCRYPT
#include <dev/rndvar.h>
#include <sys/syslog.h>
#endif

#include <sys/specdev.h>

#include "vnd.h"

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * TAILQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes disklabel]
 */
struct swapdev {
	struct swapent	swd_se;
#define	swd_dev		swd_se.se_dev		/* device id */
#define	swd_flags	swd_se.se_flags		/* flags:inuse/enable/fake */
#define	swd_priority	swd_se.se_priority	/* our priority */
#define	swd_inuse	swd_se.se_inuse		/* blocks used */
#define	swd_nblks	swd_se.se_nblks		/* total blocks */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	char			swd_exname[12];	/* name of extent above */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	int			swd_active;	/* # of active i/o reqs */
	struct bufq		swd_bufq;
	struct ucred		*swd_cred;	/* cred for file access */
#ifdef UVM_SWAP_ENCRYPT
#define	SWD_KEY_SHIFT		7		/* One key per 0.5 MByte */
#define	SWD_KEY(x,y)	&((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT])
#define	SWD_KEY_SIZE(x)	(((x) + (1 << SWD_KEY_SHIFT) - 1) >> SWD_KEY_SHIFT)

#define	SWD_DCRYPT_SHIFT	5
#define	SWD_DCRYPT_BITS		32
#define	SWD_DCRYPT_MASK		(SWD_DCRYPT_BITS - 1)
#define	SWD_DCRYPT_OFF(x)	((x) >> SWD_DCRYPT_SHIFT)
#define	SWD_DCRYPT_BIT(x)	((x) & SWD_DCRYPT_MASK)
#define	SWD_DCRYPT_SIZE(x)	(SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK) * sizeof(u_int32_t))
	u_int32_t		*swd_decrypt;	/* bitmap for decryption */
	struct swap_key		*swd_keys;	/* keys for different parts */
#endif
};
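
/*
 * Worked example of the sizing macros above (a sketch only, assuming
 * 4KB pages): a 256MB swap device spans 65536 drum pages, so the
 * decrypt bitmap takes SWD_DCRYPT_SIZE(65536) = 65536/32 * 4 = 8192
 * bytes, and SWD_KEY_SIZE(65536) = 512 keys, one per 128 pages
 * (0.5 MByte).  Not compiled in.
 */
#if 0
	int	npages = 65536;				/* 256MB / 4KB */
	size_t	bitmapsz = SWD_DCRYPT_SIZE(npages);	/* 8192 bytes */
	int	nkeys = SWD_KEY_SIZE(npages);		/* 512 keys */
#endif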

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;     /* priority */
	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
				/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define	VX_BUSY		1
#define	VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct task	vb_task;
};

/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
struct pool vndxfer_pool;
struct pool vndbuf_pool;

#define	getvndxfer(vnx)	do {					\
	int s = splbio();					\
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);		\
	splx(s);						\
} while (0)

#define	putvndxfer(vnx) {					\
	pool_put(&vndxfer_pool, (void *)(vnx));			\
}

#define	getvndbuf(vbp)	do {					\
	int s = splbio();					\
	vbp = pool_get(&vndbuf_pool, PR_WAITOK);		\
	splx(s);						\
} while (0)

#define	putvndbuf(vbp) {					\
	pool_put(&vndbuf_pool, (void *)(vbp));			\
}

/*
 * local variables
 */
struct extent *swapmap;		/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
struct swap_priority swap_priority;

/* locks */
struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");

/*
 * prototypes
 */
void		 swapdrum_add(struct swapdev *, int);
struct swapdev	*swapdrum_getsdp(int);

struct swapdev	*swaplist_find(struct vnode *, int);
void		 swaplist_insert(struct swapdev *,
		     struct swappri *, int);
void		 swaplist_trim(void);

int		 swap_on(struct proc *, struct swapdev *);
int		 swap_off(struct proc *, struct swapdev *);

void		 sw_reg_strategy(struct swapdev *, struct buf *, int);
void		 sw_reg_iodone(struct buf *);
void		 sw_reg_iodone_internal(void *, void *);
void		 sw_reg_start(struct swapdev *);

int		 uvm_swap_io(struct vm_page **, int, int, int);

void		 swapmount(void);
boolean_t	 uvm_swap_allocpages(struct vm_page **, int);

#ifdef UVM_SWAP_ENCRYPT
/* for swap encrypt */
void		 uvm_swap_markdecrypt(struct swapdev *, int, int, int);
boolean_t	 uvm_swap_needdecrypt(struct swapdev *, int);
void		 uvm_swap_initcrypt(struct swapdev *, int);
#endif

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */
	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;

	if (!swapdev_vp && bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block extent to map /dev/drum.  The extent spans
	 * 1 to INT_MAX, which allows 2 gigablocks of swap space.  Note that
	 * block 0 is reserved (used to indicate an allocation failure,
	 * or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
	    M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/* allocate pools for structures used for swapping to files. */
	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
	    NULL);
	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
	    NULL);

	/* Setup the initial swap partition */
	swapmount();
}

#ifdef UVM_SWAP_ENCRYPT
void
uvm_swap_initcrypt_all(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	int npages;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_decrypt == NULL) {
				npages = dbtob((uint64_t)sdp->swd_nblks) >>
				    PAGE_SHIFT;
				uvm_swap_initcrypt(sdp, npages);
			}
		}
	}
}

void
uvm_swap_initcrypt(struct swapdev *sdp, int npages)
{
	/*
	 * keep information if a page needs to be decrypted when we get it
	 * from the swap device.
	 * We cannot chance a malloc later, if we are doing ASYNC puts,
	 * we may not call malloc with M_WAITOK.  This consumes only
	 * 8KB memory for a 256MB swap partition.
	 */
	sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages), M_VMSWAP,
	    M_WAITOK|M_ZERO);
	sdp->swd_keys = malloc(SWD_KEY_SIZE(npages) * sizeof(struct swap_key),
	    M_VMSWAP, M_WAITOK|M_ZERO);
}
#endif /* UVM_SWAP_ENCRYPT */

boolean_t
uvm_swap_allocpages(struct vm_page **pps, int npages)
{
	struct pglist pgl;
	int i;
	boolean_t fail;

	/* Estimate if we will succeed */
	uvm_lock_fpageq();

	fail = uvmexp.free - npages < uvmexp.reserve_kernel;

	uvm_unlock_fpageq();

	if (fail)
		return FALSE;

	TAILQ_INIT(&pgl);
	if (uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
	    dma_constraint.ucr_high, 0, 0, &pgl, npages, UVM_PLA_NOWAIT))
		return FALSE;

	for (i = 0; i < npages; i++) {
		pps[i] = TAILQ_FIRST(&pgl);
		/* *sigh* */
		atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
		TAILQ_REMOVE(&pgl, pps[i], pageq);
	}

	return TRUE;
}

void
uvm_swap_freepages(struct vm_page **pps, int npages)
{
	int i;

	uvm_lock_pageq();
	for (i = 0; i < npages; i++)
		uvm_pagefree(pps[i]);
	uvm_unlock_pageq();
}

#ifdef UVM_SWAP_ENCRYPT
/*
 * Mark pages on the swap device for later decryption
 */
void
uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages,
    int decrypt)
{
	int pagestart, i;
	int off, bit;

	if (!sdp)
		return;

	pagestart = startslot - sdp->swd_drumoffset;
	for (i = 0; i < npages; i++, pagestart++) {
		off = SWD_DCRYPT_OFF(pagestart);
		bit = SWD_DCRYPT_BIT(pagestart);
		if (decrypt)
			/* pages read need decryption */
			sdp->swd_decrypt[off] |= 1 << bit;
		else
			/* pages read do not need decryption */
			sdp->swd_decrypt[off] &= ~(1 << bit);
	}
}

/*
 * Check if the page that we got from disk needs to be decrypted
 */
boolean_t
uvm_swap_needdecrypt(struct swapdev *sdp, int off)
{
	if (!sdp)
		return FALSE;

	off -= sdp->swd_drumoffset;
	return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)] & (1 << SWD_DCRYPT_BIT(off)) ?
	    TRUE : FALSE;
}
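
/*
 * Bitmap indexing sketch for the two routines above: a device-relative
 * page number selects one bit in swd_decrypt, 32 bits per word.  For
 * example, page 70 lives in word 70 >> 5 = 2, bit 70 & 31 = 6.
 * Illustrative only, not compiled in.
 */
#if 0
	int pg = 70;
	int word = SWD_DCRYPT_OFF(pg);	/* 2 */
	int bit = SWD_DCRYPT_BIT(pg);	/* 6 */
	/* marked for decryption iff swd_decrypt[word] & (1 << bit) */
#endif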

void
uvm_swap_finicrypt_all(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct swap_key *key;
	unsigned int nkeys;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_decrypt == NULL)
				continue;

			nkeys = dbtob((uint64_t)sdp->swd_nblks) >> PAGE_SHIFT;
			key = sdp->swd_keys + (SWD_KEY_SIZE(nkeys) - 1);
			do {
				if (key->refcount != 0)
					swap_key_delete(key);
			} while (key-- != sdp->swd_keys);
		}
	}
}
#endif /* UVM_SWAP_ENCRYPT */

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;  /* use newspp! */

		spp->spi_priority = priority;
		TAILQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		free(newspp, M_VMSWAP, 0);
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * tailq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => we return the swapdev we found (and removed)
 */
struct swapdev *
swaplist_find(struct vnode *vp, boolean_t remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp != vp)
				continue;
			if (remove) {
				TAILQ_REMOVE(&spp->spi_swapdev, sdp,
				    swd_next);
				uvmexp.nswapdev--;
			}
			return (sdp);
		}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 */
void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
		if (!TAILQ_EMPTY(&spp->spi_swapdev))
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP, 0);
	}
}
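
/*
 * Usage sketch for the three swaplist functions above (it mirrors the
 * SWAP_CTL path in sys_swapctl below; "vp" and "newpri" are
 * hypothetical).  The swappri is pre-allocated so the insert itself
 * never has to sleep in malloc.  Not compiled in.
 */
#if 0
	spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK);
	if ((sdp = swaplist_find(vp, 1)) != NULL) {
		swaplist_insert(sdp, spp, newpri);	/* may consume spp */
		swaplist_trim();			/* prune empty pris */
	} else
		free(spp, M_VMSWAP, 0);
#endif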

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => uvm.swap_data_lock should be unlocked (we may sleep)
 */
void
swapdrum_add(struct swapdev *sdp, int npages)
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm.swap_data_lock
 */
struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}
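
/*
 * Drum layout sketch: each device owns one contiguous page range of
 * the drum, so the reverse lookup above is a simple interval test.
 * Hypothetical example with two devices:
 *   sd0: swd_drumoffset = 1,    swd_drumsize = 1000  (pages 1..1000)
 *   sd1: swd_drumoffset = 1001, swd_drumsize = 500   (pages 1001..1500)
 * Not compiled in.
 */
#if 0
	struct swapdev *sdp = swapdrum_getsdp(1200);	/* -> sd1 */
	int pageno = 1200 - sdp->swd_drumoffset;	/* page 199 on sd1 */
#endif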

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[MAXPATHLEN];
	size_t	len;
	int	count, error, misc;
	int	priority;

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter_write(&swap_syscall_lock);

	/*
	 * we handle the non-priv NSWAP and STATS request first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm.swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		LIST_FOREACH(spp, &swap_priority, spi_swappri) {
			TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
				if (count >= misc)
					continue;

				sdp->swd_inuse =
				    btodb((u_int64_t)sdp->swd_npginuse <<
				    PAGE_SHIFT);
				error = copyout(&sdp->swd_se, sep,
				    sizeof(struct swapent));
				if (error)
					goto out;

				/* now copy out the path if necessary */
				error = copyoutstr(sdp->swd_path,
				    sep->se_path, sizeof(sep->se_path), NULL);
				if (error)
					goto out;

				count++;
				sep++;
			}
		}

		*retval = count;
		error = 0;
		goto out;
	}

	/* all other requests require superuser privs.  verify. */
	if ((error = suser(p, 0)))
		goto out;

	/*
	 * at this point we expect a path name in arg.  we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 */
	error = copyinstr(SCARG(uap, arg), userpath, sizeof(userpath), &len);
	if (error)
		goto out;
	disk_map(userpath, userpath, sizeof(userpath), DM_OPENBLCK);
	NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, userpath, p);
	if ((error = namei(&nd)))
		goto out;
	vp = nd.ni_vp;
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch (SCARG(uap, cmd)) {
	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		dumpdev = vp->v_rdev;
		break;
	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		if (error)
			free(spp, M_VMSWAP, 0);
		break;
	case SWAP_ON:
		/*
		 * check for duplicates.  if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */
		priority = SCARG(uap, misc);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			break;
		}
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK|M_ZERO);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;

		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG) {
			sdp->swd_cred = crdup(p->p_ucred);
		}

		swaplist_insert(sdp, spp, priority);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */
		if ((error = swap_on(p, sdp)) != 0) {
			(void) swaplist_find(vp, 1);  /* kill fake entry */
			swaplist_trim();
			if (vp->v_type == VREG) {
				crfree(sdp->swd_cred);
			}
			free(sdp->swd_path, M_VMSWAP, 0);
			free(sdp, M_VMSWAP, 0);
			break;
		}
		break;
	case SWAP_OFF:
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(p, sdp);
		break;
	default:
		error = EINVAL;
	}

	/* done!  release the ref gained by namei() and unlock. */
	vput(vp);

out:
	rw_exit_write(&swap_syscall_lock);

	return (error);
}
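
/*
 * Userland sketch of the SWAP_NSWAP/SWAP_STATS path above, roughly
 * what swapctl(8) does when listing devices.  This is a separate
 * program, not part of this file, and is not compiled in.
 */
#if 0
#include <sys/types.h>
#include <sys/swap.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct swapent *sep;
	int i, n;

	if ((n = swapctl(SWAP_NSWAP, NULL, 0)) <= 0)
		return (1);
	if ((sep = calloc(n, sizeof(*sep))) == NULL ||
	    swapctl(SWAP_STATS, sep, n) != n)
		return (1);
	for (i = 0; i < n; i++)
		printf("%s: %d/%d blocks, prio %d\n", sep[i].se_path,
		    sep[i].se_inuse, sep[i].se_nblks, sep[i].se_priority);
	return (0);
}
#endif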

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => caller should leave uvm.swap_data_lock unlocked, we may lock it
 *	if needed.
 */
int
swap_on(struct proc *p, struct swapdev *sdp)
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	struct vattr va;
#if defined(NFSCLIENT)
	extern struct vops nfs_vops;
#endif /* defined(NFSCLIENT) */
	dev_t dev;

	/*
	 * we want to enable swapping on sdp.  the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */
	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

#if NVND > 0
	/* no swapping to vnds. */
	if (bdevsw[major(dev)].d_strategy == vndstrategy)
		return (EOPNOTSUPP);
#endif

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	/*
	 * we now need to determine the size of the swap area.  for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.  take it easy on NFS servers.
		 */
#if defined(NFSCLIENT)
		if (vp->v_op == &nfs_vops)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* defined(NFSCLIENT) */
			sdp->swd_maxactive = 8; /* XXX */
		bufq_init(&sdp->swd_bufq, BUFQ_FIFO);
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */
	sdp->swd_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}
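
#if 0
	/*
	 * Worked example of the conversion above (a sketch, assuming
	 * 4KB pages and 512-byte disk blocks): a 1GB block device has
	 * nblocks = 2097152, so npages = dbtob(2097152) >> 12 = 262144;
	 * being a VBLK we skip page 0, giving addr = 1, size = 262143.
	 */
	nblocks = 2097152;				/* 1GB device */
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;	/* 262144 */
#endif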

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.  we want at least one page.
	 */
	if (size < 1) {
		error = EINVAL;
		goto bad;
	}

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	snprintf(sdp->swd_exname, sizeof(sdp->swd_exname), "swap0x%04x",
	    count++);

	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(sdp->swd_exname, 0, npages - 1, M_VMSWAP,
	    0, 0, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel reserve");
		/* XXX: is extent synchronized with swd_npginuse? */
	}
#ifdef HIBERNATE
	/*
	 * Lock down the last region of primary disk swap, in case
	 * hibernate needs to place a signature there.
	 */
	if (dev == swdevt[0].sw_dev && vp->v_type == VBLK && size > 3) {
		if (extent_alloc_region(sdp->swd_ex,
		    npages - 1 - 1, 1, EX_WAITOK))
			panic("hibernate reserve");
		/* XXX: is extent synchronized with swd_npginuse? */
	}
#endif

	/* add a ref to vp to reflect usage as a swap device. */
	vref(vp);

#ifdef UVM_SWAP_ENCRYPT
	if (uvm_doswapencrypt)
		uvm_swap_initcrypt(sdp, npages);
#endif
	/* now add the new swapdev to the drum and enable. */
	swapdrum_add(sdp, npages);
	sdp->swd_npages = size;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	return (0);

bad:
	/* failure: close device if necessary and return error. */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
int
swap_off(struct proc *p, struct swapdev *sdp)
{
	int error = 0;

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */
	if (uao_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize)) {
		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		sdp->swd_flags |= SWF_ENABLE;
		return (error);
	}

	/*
	 * done with the vnode and saved creds.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	if (sdp->swd_vp->v_type == VREG) {
		crfree(sdp->swd_cred);
	}
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	}

	uvmexp.swpages -= sdp->swd_npages;

	if (swaplist_find(sdp->swd_vp, 1) == NULL)
		panic("swap_off: swapdev not in list");
	swaplist_trim();

	/*
	 * free all resources!
	 */
	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
	    EX_WAITOK);
	extent_destroy(sdp->swd_ex);
	free(sdp, M_VMSWAP, 0);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	int s, pageno, bn;

	/*
	 * convert block number to swapdev.  note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((u_int64_t)bp->b_blkno) >> PAGE_SHIFT;
	sdp = swapdrum_getsdp(pageno);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	/* convert drum page number to block number on this swapdev. */
	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */
	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
	case VBLK:
		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		buf_replacevnode(bp, sdp->swd_vp);

		bp->b_blkno = bn;
		splx(s);
		VOP_STRATEGY(bp);
		return;
	case VREG:
		/* delegate to sw_reg_strategy function. */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	caddr_t		addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	getvndxfer(vnx);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((u_int64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;
		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == -1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		/*
		 * now get a buf structure.  note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		getvndbuf(nbp);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_bq       = NULL;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_proc     = bp->b_proc;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = NULLVP;
		nbp->vb_buf.b_vnbufs.le_next = NOLIST;
		LIST_INIT(&nbp->vb_buf.b_dep);

		/*
		 * set b_dirtyoff/end and b_validoff/end.  this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		/* patch it back to the vnx */
		task_set(&nbp->vb_task, sw_reg_iodone_internal, nbp, vnx);

		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* start I/O if we are not over our limit */
		bufq_queue(&sdp->swd_bufq, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}
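
/*
 * Transfer-size sketch for the loop above, with hypothetical numbers:
 * given swd_bsize = 64KB and byteoff = 70000, off = 70000 % 65536 =
 * 4464; with no read-ahead (nra = 0), sz = (1 + 0) * 65536 - 4464 =
 * 61072 bytes, further clipped to the remaining resid.  Not compiled in.
 */
#if 0
	off = 70000 % 65536;		/* 4464 */
	sz = (1 + 0) * 65536 - off;	/* 61072 */
#endif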
/*
 * sw_reg_start: start an I/O request on the requested swapdev.
 */
void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*bp;

	/* XXX: recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = bufq_dequeue(&sdp->swd_bufq);
		if (bp == NULL)
			break;

		sdp->swd_active++;

		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;

		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 *
 * XXX:
 * We only put this onto a taskq here, because of the maxactive game since
 * it basically requires us to call back into VOP_STRATEGY() (where we must
 * be able to sleep) via sw_reg_start().
 */
void
sw_reg_iodone(struct buf *bp)
{
	struct vndbuf *vbp = (struct vndbuf *)bp;
	task_add(systq, &vbp->vb_task);
}

void
sw_reg_iodone_internal(void *xvbp, void *xvnx)
{
	struct vndbuf *vbp = xvbp;
	struct vndxfer *vnx = xvnx;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int resid, s;

	s = splbio();

	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	/* pass error upward */
	if (vbp->vb_buf.b_error)
		vnx->vx_error = vbp->vb_buf.b_error;

	/* disassociate this buffer from the vnode (if any). */
	if (vbp->vb_buf.b_vp != NULL) {
		brelvp(&vbp->vb_buf);
	}

	/* kill vbp structure */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!  start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}

/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the tail queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm.swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots, boolean_t lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long	result;

	/*
	 * no swap devices configured yet?  definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN, 0,
			    EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
			    &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the tailq.
			 */
			TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			/* done!  return drum slot number */
			return (result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	return 0;		/* failed */
}

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;

	sdp = swapdrum_getsdp(startslot);
	if (sdp != NULL) {
		/*
		 * we just keep track of how many pages have been marked bad
		 * in this device, to make everything add up in swap_off().
		 * we assume here that the range of slots will all be within
		 * one swap device.
		 */
		sdp->swd_npgbad += nslots;
	}
}
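
/*
 * Slot lifecycle sketch (illustrative only, not compiled in): ask for
 * a 4-slot cluster, letting the allocator fall back to a single slot
 * (lessok = TRUE; it writes the granted count back through nslots),
 * and release the slots when done.  Slot 0 means allocation failed.
 */
#if 0
	int nslots = 4, slot;

	if ((slot = uvm_swap_alloc(&nslots, TRUE)) != 0) {
		/* ... uvm_swap_put(slot, pps, nslots, PGO_SYNCIO) ... */
		uvm_swap_free(slot, nslots);
	}
#endif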

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;

	/*
	 * ignore attempts to free the "bad" slot.
	 */
	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.  must hold pri lock to do
	 * lookup and access the extent.
	 */
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
	    EX_MALLOCOK|EX_NOWAIT) != 0) {
		printf("warning: resource shortage: %d pages of swap lost\n",
		    nslots);
	}

	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef UVM_SWAP_ENCRYPT
	{
		int i;
		if (swap_encrypt_initialized) {
			/* Dereference keys */
			for (i = 0; i < nslots; i++)
				if (uvm_swap_needdecrypt(sdp, startslot + i)) {
					struct swap_key *key;

					key = SWD_KEY(sdp, startslot + i);
					if (key->refcount != 0)
						SWAP_KEY_PUT(sdp, key);
				}

			/* Mark range as not decrypt */
			uvm_swap_markdecrypt(sdp, startslot, nslots, 0);
		}
	}
#endif /* UVM_SWAP_ENCRYPT */
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int	result;

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int	result;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return VM_PAGER_ERROR;
	}

	/* this page is (about to be) no longer only in swap. */
	uvmexp.swpgonly--;

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	if (result != VM_PAGER_OK && result != VM_PAGER_PEND) {
		/* oops, the read failed so it really is still only in swap. */
		uvmexp.swpgonly++;
	}

	return (result);
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */
int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct	buf *bp;
	vaddr_t kva;
	int	result, s, mapinflags, pflag, bounce = 0, i;
	boolean_t write, async;
	vaddr_t bouncekva;
	struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
#ifdef UVM_SWAP_ENCRYPT
	struct swapdev *sdp;
	int	encrypt = 0;
#endif

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/* convert starting drum slot to block number */
	startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).
	 */
	mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
	if (!async)
		mapinflags |= UVMPAGER_MAPIN_WAITOK;
	kva = uvm_pagermapin(pps, npages, mapinflags);
	if (kva == 0)
		return (VM_PAGER_AGAIN);

#ifdef UVM_SWAP_ENCRYPT
	if (write) {
		/*
		 * Check if we need to do swap encryption on old pages.
		 * Later we need a different scheme, that swap encrypts
		 * all pages of a process that had at least one page swap
		 * encrypted.  Then we might not need to copy all pages
		 * in the cluster, and avoid the memory overhead in
		 * swapping.
		 */
		if (uvm_doswapencrypt)
			encrypt = 1;
	}

	if (swap_encrypt_initialized || encrypt) {
		/*
		 * we need to know the swap device that we are swapping to/from
		 * to see if the pages need to be marked for decryption or
		 * actually need to be decrypted.
		 * XXX - does this information stay the same over the whole
		 * execution of this function?
		 */
		sdp = swapdrum_getsdp(startslot);
	}

	/*
	 * Check that we are dma capable for read (write always bounces
	 * through the swapencrypt anyway...)
	 */
	if (write && encrypt) {
		bounce = 1;	/* bounce through swapencrypt always */
	} else {
#else
	{
#endif

		for (i = 0; i < npages; i++) {
			if (VM_PAGE_TO_PHYS(pps[i]) < dma_constraint.ucr_low ||
			    VM_PAGE_TO_PHYS(pps[i]) > dma_constraint.ucr_high) {
				bounce = 1;
				break;
			}
		}
	}

	if (bounce) {
		int swmapflags;

		/* We always need write access. */
		swmapflags = UVMPAGER_MAPIN_READ;
		if (!async)
			swmapflags |= UVMPAGER_MAPIN_WAITOK;

		if (!uvm_swap_allocpages(tpps, npages)) {
			uvm_pagermapout(kva, npages);
			return (VM_PAGER_AGAIN);
		}

		bouncekva = uvm_pagermapin(tpps, npages, swmapflags);
		if (bouncekva == 0) {
			uvm_pagermapout(kva, npages);
			uvm_swap_freepages(tpps, npages);
			return (VM_PAGER_AGAIN);
		}
	}

	/* encrypt to swap */
	if (write && bounce) {
		int i, opages;
		caddr_t src, dst;
		u_int64_t block;

		src = (caddr_t) kva;
		dst = (caddr_t) bouncekva;
		block = startblk;
		for (i = 0; i < npages; i++) {
#ifdef UVM_SWAP_ENCRYPT
			struct swap_key *key;

			if (encrypt) {
				key = SWD_KEY(sdp, startslot + i);
				SWAP_KEY_GET(sdp, key);	/* add reference */

				swap_encrypt(key, src, dst, block, PAGE_SIZE);
				block += btodb(PAGE_SIZE);
			} else {
#else
			{
#endif /* UVM_SWAP_ENCRYPT */
				memcpy(dst, src, PAGE_SIZE);
			}
			/* this just tells async callbacks to free */
			atomic_setbits_int(&tpps[i]->pg_flags, PQ_ENCRYPT);
			src += PAGE_SIZE;
			dst += PAGE_SIZE;
		}

		uvm_pagermapout(kva, npages);

		/* dispose of pages we dont use anymore */
		opages = npages;
		uvm_pager_dropcluster(NULL, NULL, pps, &opages,
		    PGO_PDFREECLUST);

		kva = bouncekva;
	}

	/*
	 * now allocate a buf for the i/o.
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	pflag = (async || curproc == uvm.pagedaemon_proc) ? PR_NOWAIT :
	    PR_WAITOK;
	bp = pool_get(&bufpool, pflag);
	splx(s);

	/*
	 * if we failed to get a swapbuf, return "try again"
	 */
	if (bp == NULL) {
		if (write && bounce) {
#ifdef UVM_SWAP_ENCRYPT
			int i;

			/* swap encrypt needs cleanup */
			if (encrypt)
				for (i = 0; i < npages; i++)
					SWAP_KEY_PUT(sdp, SWD_KEY(sdp,
					    startslot + i));
#endif

			uvm_pagermapout(kva, npages);
			uvm_swap_freepages(tpps, npages);
		}
		return (VM_PAGER_AGAIN);
	}

	/*
	 * prevent ASYNC reads.
	 * uvm_swap_io is only called from uvm_swap_get, uvm_swap_get
	 * assumes that all gets are SYNCIO.  Just make sure here.
	 * XXXARTUBC - might not be true anymore.
	 */
	if (!write) {
		flags &= ~B_ASYNC;
		async = 0;
	}

	/*
	 * fill in the bp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp->b_flags = B_BUSY | B_NOCACHE | B_RAW | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	if (bounce)
		bp->b_data = (caddr_t)bouncekva;
	else
		bp->b_data = (caddr_t)kva;
	bp->b_bq = NULL;
	bp->b_blkno = startblk;
	LIST_INIT(&bp->b_dep);
	s = splbio();
	bp->b_vp = NULL;
	buf_replacevnode(bp, swapdev_vp);
	splx(s);
	bp->b_bufsize = bp->b_bcount = (long)npages << PAGE_SHIFT;

	/*
	 * for pageouts we must set "dirtyoff" [NFS client code needs it].
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if (write) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages << PAGE_SHIFT;
#ifdef UVM_SWAP_ENCRYPT
		/* mark the pages in the drum for decryption */
		if (swap_encrypt_initialized)
			uvm_swap_markdecrypt(sdp, startslot, npages, encrypt);
#endif
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/* for async ops we must set up the iodone handler. */
	if (async) {
		bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
		    B_PDAEMON : 0);
		bp->b_iodone = uvm_aio_biodone;
	}

	/* now we start the I/O, and if async, return. */
	VOP_STRATEGY(bp);
	if (async)
		return (VM_PAGER_PEND);

	/* must be sync i/o.  wait for it to finish */
	(void) biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

	/* decrypt swap */
	if (!write && !(bp->b_flags & B_ERROR)) {
		int i;
		caddr_t data = (caddr_t)kva;
		caddr_t dst = (caddr_t)kva;
		u_int64_t block = startblk;

		if (bounce)
			data = (caddr_t)bouncekva;

		for (i = 0; i < npages; i++) {
#ifdef UVM_SWAP_ENCRYPT
			struct swap_key *key;

			/* Check if we need to decrypt */
			if (swap_encrypt_initialized &&
			    uvm_swap_needdecrypt(sdp, startslot + i)) {
				key = SWD_KEY(sdp, startslot + i);
				if (key->refcount == 0) {
					result = VM_PAGER_ERROR;
					break;
				}
				swap_decrypt(key, data, dst, block, PAGE_SIZE);
			} else if (bounce) {
#else
			if (bounce) {
#endif
				memcpy(dst, data, PAGE_SIZE);
			}
			data += PAGE_SIZE;
			dst += PAGE_SIZE;
			block += btodb(PAGE_SIZE);
		}
		if (bounce)
			uvm_pagermapout(bouncekva, npages);
	}

	/* kill the pager mapping */
	uvm_pagermapout(kva, npages);

	/* not needed anymore, free after encryption/bouncing */
	if (!write && bounce)
		uvm_swap_freepages(tpps, npages);

	/* now dispose of the buf */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);

	if (write && bp->b_vp)
		vwakeup(bp->b_vp);
	pool_put(&bufpool, bp);
	splx(s);

	/* finally return. */
	return (result);
}

void
swapmount(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct vnode *vp;
	dev_t swap_dev = swdevt[0].sw_dev;
	char *nam;

	/*
	 * No locking here since we happen to know that we will just be called
	 * once before any other process has forked.
	 */
	if (swap_dev == NODEV)
		return;

	sdp = malloc(sizeof(*sdp), M_VMSWAP, M_WAITOK|M_ZERO);
	spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK);

	sdp->swd_flags = SWF_FAKE;
	sdp->swd_dev = swap_dev;

	/* Construct a potential path to swap */
	sdp->swd_pathlen = MNAMELEN + 1;
	sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK | M_ZERO);
#if defined(NFSCLIENT)
	if (swap_dev == NETDEV) {
		extern struct nfs_diskless nfs_diskless;

		snprintf(sdp->swd_path, sdp->swd_pathlen, "%s",
		    nfs_diskless.nd_swap.ndm_host);
		vp = nfs_diskless.sw_vp;
		goto gotit;
	} else
#endif
	if (bdevvp(swap_dev, &vp)) {
		free(sdp->swd_path, M_VMSWAP, 0);
		free(sdp, M_VMSWAP, 0);
		free(spp, M_VMSWAP, 0);
		return;
	}

	if ((nam = findblkname(major(swap_dev))))
		snprintf(sdp->swd_path, sdp->swd_pathlen, "/dev/%s%d%c", nam,
		    DISKUNIT(swap_dev), 'a' + DISKPART(swap_dev));
	else
		snprintf(sdp->swd_path, sdp->swd_pathlen, "blkdev0x%x",
		    swap_dev);

#if defined(NFSCLIENT)
gotit:
#endif
	sdp->swd_pathlen = strlen(sdp->swd_path) + 1;
	sdp->swd_vp = vp;

	swaplist_insert(sdp, spp, 0);

	if (swap_on(curproc, sdp)) {
		swaplist_find(vp, 1);
		swaplist_trim();
		vput(sdp->swd_vp);
		free(sdp->swd_path, M_VMSWAP, 0);
		free(sdp, M_VMSWAP, 0);
		return;
	}
}

#ifdef HIBERNATE
int
uvm_hibswap(dev_t dev, u_long *sp, u_long *ep)
{
	struct swapdev *sdp, *swd = NULL;
	struct swappri *spp;
	struct extent_region *exr, *exrn;
	u_long start = 0, end = 0, size = 0;

	/* no swap devices configured yet? */
	if (uvmexp.nswapdev < 1 || dev != swdevt[0].sw_dev)
		return (1);

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_dev == dev)
				swd = sdp;
		}
	}

	if (swd == NULL || (swd->swd_flags & SWF_ENABLE) == 0)
		return (1);

	LIST_FOREACH(exr, &swd->swd_ex->ex_regions, er_link) {
		u_long gapstart, gapend, gapsize;

		gapstart = exr->er_end + 1;
		exrn = LIST_NEXT(exr, er_link);
		if (!exrn)
			break;
		gapend = exrn->er_start - 1;
		gapsize = gapend - gapstart;
		if (gapsize > size) {
			start = gapstart;
			end = gapend;
			size = gapsize;
		}
	}

	if (size) {
		*sp = start;
		*ep = end;
		return (0);
	}
	return (1);
}
#endif /* HIBERNATE */