/*	$OpenBSD: uvm_swap.c,v 1.140 2016/09/15 02:00:18 dlg Exp $	*/
/*	$NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/disk.h>
#include <sys/task.h>
#include <sys/pledge.h>
#if defined(NFSCLIENT)
#include <sys/socket.h>
#include <sys/domain.h>
#include <netinet/in.h>
#include <nfs/nfsproto.h>
#include <nfs/nfsdiskless.h>
#endif

#include <uvm/uvm.h>
#ifdef UVM_SWAP_ENCRYPT
#include <uvm/uvm_swap_encrypt.h>
#endif

#include <sys/specdev.h>

#include "vnd.h"

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.   there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.   this LIST is headed
 * by the "swap_priority" global var.    each "swappri" contains a
 * TAILQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * sys_swapctl() performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
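
/*
 * Illustrative sketch (not part of this file's build): how userland
 * might exercise the operations above through swapctl(2).  The array
 * size and the priority value are made up for the example.
 *
 *	struct swapent se[8];
 *	int n;
 *
 *	n = swapctl(SWAP_NSWAP, NULL, 0);	(count configured devices)
 *	n = swapctl(SWAP_STATS, se, 8);		(fill in up to 8 swapents)
 *	swapctl(SWAP_ON, "/dev/sd0b", 10);	(enable at priority 10)
 */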

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes disklabel]
 */
struct swapdev {
	struct swapent	swd_se;
#define	swd_dev		swd_se.se_dev		/* device id */
#define	swd_flags	swd_se.se_flags		/* flags:inuse/enable/fake */
#define	swd_priority	swd_se.se_priority	/* our priority */
#define	swd_inuse	swd_se.se_inuse		/* blocks used */
#define	swd_nblks	swd_se.se_nblks		/* total blocks */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	char			swd_exname[12];	/* name of extent above */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	int			swd_active;	/* # of active i/o reqs */
	struct bufq		swd_bufq;
	struct ucred		*swd_cred;	/* cred for file access */
#ifdef UVM_SWAP_ENCRYPT
#define SWD_KEY_SHIFT		7		/* One key per 0.5 MByte */
#define SWD_KEY(x,y)	&((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT])
#define SWD_KEY_SIZE(x)	(((x) + (1 << SWD_KEY_SHIFT) - 1) >> SWD_KEY_SHIFT)

#define SWD_DCRYPT_SHIFT	5
#define SWD_DCRYPT_BITS		32
#define SWD_DCRYPT_MASK		(SWD_DCRYPT_BITS - 1)
#define SWD_DCRYPT_OFF(x)	((x) >> SWD_DCRYPT_SHIFT)
#define SWD_DCRYPT_BIT(x)	((x) & SWD_DCRYPT_MASK)
#define SWD_DCRYPT_SIZE(x)	(SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK) * sizeof(u_int32_t))
	u_int32_t		*swd_decrypt;	/* bitmap for decryption */
	struct swap_key		*swd_keys;	/* keys for different parts */
#endif
};
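
/*
 * Worked example of the key geometry above, assuming 4 KB pages:
 * one swap_key covers 1 << SWD_KEY_SHIFT = 128 drum pages, i.e. 512 KB
 * of swap ("one key per 0.5 MByte").  SWD_KEY_SIZE() rounds up, so a
 * 256 MB device (65536 pages) needs 512 keys, and drum page y on
 * device x uses key (y - swd_drumoffset) >> 7.
 */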

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_vnx;
	struct task	vb_task;
};

/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
struct pool vndxfer_pool;
struct pool vndbuf_pool;


/*
 * local variables
 */
struct extent *swapmap;		/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
struct swap_priority swap_priority;

/* locks */
struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");

/*
 * prototypes
 */
void		 swapdrum_add(struct swapdev *, int);
struct swapdev	*swapdrum_getsdp(int);

struct swapdev	*swaplist_find(struct vnode *, int);
void		 swaplist_insert(struct swapdev *,
		     struct swappri *, int);
void		 swaplist_trim(void);

int swap_on(struct proc *, struct swapdev *);
int swap_off(struct proc *, struct swapdev *);

void sw_reg_strategy(struct swapdev *, struct buf *, int);
void sw_reg_iodone(struct buf *);
void sw_reg_iodone_internal(void *);
void sw_reg_start(struct swapdev *);

int uvm_swap_io(struct vm_page **, int, int, int);

void swapmount(void);
boolean_t uvm_swap_allocpages(struct vm_page **, int);

#ifdef UVM_SWAP_ENCRYPT
/* for swap encrypt */
void uvm_swap_markdecrypt(struct swapdev *, int, int, int);
boolean_t uvm_swap_needdecrypt(struct swapdev *, int);
void uvm_swap_initcrypt(struct swapdev *, int);
#endif

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */
	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;

	if (!swapdev_vp && bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block extent to map /dev/drum.  the extent spans
	 * 1 to INT_MAX, which allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
	    M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/* allocate pools for structures used for swapping to files. */
	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, IPL_BIO, 0,
	    "swp vnx", NULL);
	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, IPL_BIO, 0,
	    "swp vnd", NULL);

	/* Setup the initial swap partition */
	swapmount();
}

#ifdef UVM_SWAP_ENCRYPT
void
uvm_swap_initcrypt_all(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	int npages;


	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_decrypt == NULL) {
				npages = dbtob((uint64_t)sdp->swd_nblks) >>
				    PAGE_SHIFT;
				uvm_swap_initcrypt(sdp, npages);
			}
		}
	}
}

void
uvm_swap_initcrypt(struct swapdev *sdp, int npages)
{
	/*
	 * keep track of whether a page needs to be decrypted when we get
	 * it from the swap device.
	 * We cannot chance a malloc later: if we are doing ASYNC puts,
	 * we may not call malloc with M_WAITOK.  This consumes only
	 * 8KB memory for a 256MB swap partition.
	 */
	sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages), M_VMSWAP,
	    M_WAITOK|M_ZERO);
	sdp->swd_keys = malloc(SWD_KEY_SIZE(npages) * sizeof(struct swap_key),
	    M_VMSWAP, M_WAITOK|M_ZERO);
}

#endif /* UVM_SWAP_ENCRYPT */

boolean_t
uvm_swap_allocpages(struct vm_page **pps, int npages)
{
	struct pglist	pgl;
	int i;
	boolean_t fail;

	/* Estimate if we will succeed */
	uvm_lock_fpageq();

	fail = uvmexp.free - npages < uvmexp.reserve_kernel;

	uvm_unlock_fpageq();

	if (fail)
		return FALSE;

	TAILQ_INIT(&pgl);
	if (uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
	    dma_constraint.ucr_high, 0, 0, &pgl, npages, UVM_PLA_NOWAIT))
		return FALSE;

	for (i = 0; i < npages; i++) {
		pps[i] = TAILQ_FIRST(&pgl);
		/* *sigh* */
		atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
		TAILQ_REMOVE(&pgl, pps[i], pageq);
	}

	return TRUE;
}

void
uvm_swap_freepages(struct vm_page **pps, int npages)
{
	int i;

	uvm_lock_pageq();
	for (i = 0; i < npages; i++)
		uvm_pagefree(pps[i]);
	uvm_unlock_pageq();
}

#ifdef UVM_SWAP_ENCRYPT
/*
 * Mark pages on the swap device for later decryption
 */
void
uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages,
    int decrypt)
{
	int pagestart, i;
	int off, bit;

	if (!sdp)
		return;

	pagestart = startslot - sdp->swd_drumoffset;
	for (i = 0; i < npages; i++, pagestart++) {
		off = SWD_DCRYPT_OFF(pagestart);
		bit = SWD_DCRYPT_BIT(pagestart);
		if (decrypt)
			/* pages read need decryption */
			sdp->swd_decrypt[off] |= 1 << bit;
		else
			/* pages read do not need decryption */
			sdp->swd_decrypt[off] &= ~(1 << bit);
	}
}

/*
 * Check if the page that we got from disk needs to be decrypted
 */
boolean_t
uvm_swap_needdecrypt(struct swapdev *sdp, int off)
{
	if (!sdp)
		return FALSE;

	off -= sdp->swd_drumoffset;
	return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)] & (1 << SWD_DCRYPT_BIT(off)) ?
		TRUE : FALSE;
}
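
/*
 * Worked example of the decrypt bitmap above (numbers illustrative):
 * the bitmap holds one bit per drum page, 32 bits per u_int32_t word.
 * For device-local page 70, SWD_DCRYPT_OFF(70) = 70 >> 5 = word 2 and
 * SWD_DCRYPT_BIT(70) = 70 & 31 = bit 6.  At one bit per 4 KB page this
 * is how a 256 MB partition (65536 pages) fits in 8 KB of bitmap.
 */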

void
uvm_swap_finicrypt_all(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct swap_key *key;
	unsigned int nkeys;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_decrypt == NULL)
				continue;

			nkeys = dbtob((uint64_t)sdp->swd_nblks) >> PAGE_SHIFT;
			key = sdp->swd_keys + (SWD_KEY_SIZE(nkeys) - 1);
			do {
				if (key->refcount != 0)
					swap_key_delete(key);
			} while (key-- != sdp->swd_keys);
		}
	}
}
#endif /* UVM_SWAP_ENCRYPT */

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */

		spp->spi_priority = priority;
		TAILQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		free(newspp, M_VMSWAP, sizeof(*newspp));
	}

	/*
	 * priority found (or created).   now insert on the priority's
	 * tailq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => we return the swapdev we found (and removed)
 */
struct swapdev *
swaplist_find(struct vnode *vp, boolean_t remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp != vp)
				continue;
			if (remove) {
				TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
				uvmexp.nswapdev--;
			}
			return (sdp);
		}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 */
void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
		if (!TAILQ_EMPTY(&spp->spi_swapdev))
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP, sizeof(*spp));
	}
}
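
/*
 * Example of the resulting list shape (values illustrative): enabling
 * devices at priorities 0, 0 and 10 yields two swappri nodes, 0 and 10,
 * in ascending order; the priority-0 node carries a two-entry tailq.
 * Since uvm_swap_alloc() walks swap_priority from the head, devices
 * with the numerically smallest priority are always tried first.
 */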

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => uvm.swap_data_lock should be unlocked (we may sleep)
 */
void
swapdrum_add(struct swapdev *sdp, int npages)
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm.swap_data_lock
 */
struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}
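
/*
 * Worked example of the drum address space (numbers illustrative):
 * with a 1000-page and a 500-page device configured, swapdrum_add()
 * might hand out drum pages [1,1000] and [1001,1500].  A drum page
 * number such as 1200 then falls inside the second device's chunk, so
 * swapdrum_getsdp(1200) returns that device; the device-local page is
 * 1200 - swd_drumoffset = 199.
 */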

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[MAXPATHLEN];
	size_t	len;
	int	count, error, misc;
	int	priority;

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter_write(&swap_syscall_lock);

	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm.swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		LIST_FOREACH(spp, &swap_priority, spi_swappri) {
			TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
				if (count >= misc)
					continue;

				sdp->swd_inuse =
				    btodb((u_int64_t)sdp->swd_npginuse <<
				    PAGE_SHIFT);
				error = copyout(&sdp->swd_se, sep,
				    sizeof(struct swapent));
				if (error)
					goto out;

				/* now copy out the path if necessary */
				error = copyoutstr(sdp->swd_path,
				    sep->se_path, sizeof(sep->se_path), NULL);
				if (error)
					goto out;

				count++;
				sep++;
			}
		}

		*retval = count;
		error = 0;
		goto out;
	}

	/* all other requests require superuser privs.   verify. */
	if ((error = suser(p, 0)) || (error = pledge_swapctl(p)))
		goto out;

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 */
	error = copyinstr(SCARG(uap, arg), userpath, sizeof(userpath), &len);
	if (error)
		goto out;
	disk_map(userpath, userpath, sizeof(userpath), DM_OPENBLCK);
	NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, userpath, p);
	if ((error = namei(&nd)))
		goto out;
	vp = nd.ni_vp;
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {
	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		dumpdev = vp->v_rdev;
		break;
	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		if (error)
			free(spp, M_VMSWAP, sizeof(*spp));
		break;
	case SWAP_ON:
		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */
		priority = SCARG(uap, misc);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			break;
		}
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK|M_ZERO);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;

		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG) {
			sdp->swd_cred = crdup(p->p_ucred);
		}

		swaplist_insert(sdp, spp, priority);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		strlcpy(sdp->swd_path, userpath, len);

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(p, sdp)) != 0) {
			(void) swaplist_find(vp, 1);	/* kill fake entry */
			swaplist_trim();
			if (vp->v_type == VREG) {
				crfree(sdp->swd_cred);
			}
			free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
			free(sdp, M_VMSWAP, sizeof(*sdp));
			break;
		}
		break;
	case SWAP_OFF:
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(p, sdp);
		break;
	default:
		error = EINVAL;
	}

	/* done!  release the ref gained by namei() and unlock. */
	vput(vp);

out:
	rw_exit_write(&swap_syscall_lock);

	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => caller should leave uvm.swap_data_lock unlocked, we may lock it
 *	if needed.
 */
int
swap_on(struct proc *p, struct swapdev *sdp)
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	struct vattr va;
#if defined(NFSCLIENT)
	extern struct vops nfs_vops;
#endif /* defined(NFSCLIENT) */
	dev_t dev;

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

#if NVND > 0
	/* no swapping to vnds. */
	if (bdevsw[major(dev)].d_strategy == vndstrategy)
		return (EOPNOTSUPP);
#endif

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
#if defined(NFSCLIENT)
		if (vp->v_op == &nfs_vops)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* defined(NFSCLIENT) */
			sdp->swd_maxactive = 8; /* XXX */
		bufq_init(&sdp->swd_bufq, BUFQ_FIFO);
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}
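
	/*
	 * Worked example (illustrative numbers, assuming 4 KB pages):
	 * a 64 MB block device gives npages = 16384.  Page 0 is left
	 * alone for the disklabel/bootblocks, so addr = 1 and size =
	 * 16383 usable pages; a swap file of the same size would keep
	 * all 16384 with addr = 0.
	 */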

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */
	if (size < 1) {
		error = EINVAL;
		goto bad;
	}

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	snprintf(sdp->swd_exname, sizeof(sdp->swd_exname), "swap0x%04x",
	    count++);

	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(sdp->swd_exname, 0, npages - 1, M_VMSWAP,
	    0, 0, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel reserve");
		/* XXX: is extent synchronized with swd_npginuse? */
	}
#ifdef HIBERNATE
	/*
	 * Lock down the last region of primary disk swap, in case
	 * hibernate needs to place a signature there.
	 */
	if (dev == swdevt[0].sw_dev && vp->v_type == VBLK && size > 3) {
		if (extent_alloc_region(sdp->swd_ex,
		    npages - 1 - 1, 1, EX_WAITOK))
			panic("hibernate reserve");
		/* XXX: is extent synchronized with swd_npginuse? */
	}
#endif

	/* add a ref to vp to reflect usage as a swap device. */
	vref(vp);

#ifdef UVM_SWAP_ENCRYPT
	if (uvm_doswapencrypt)
		uvm_swap_initcrypt(sdp, npages);
#endif
	/* now add the new swapdev to the drum and enable. */
	swapdrum_add(sdp, npages);
	sdp->swd_npages = size;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	return (0);

bad:
	/* failure: close device if necessary and return error. */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
int
swap_off(struct proc *p, struct swapdev *sdp)
{
	int error = 0;

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */
	if (uao_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize)) {
		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		sdp->swd_flags |= SWF_ENABLE;
		return (error);
	}

	/*
	 * done with the vnode and saved creds.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	if (sdp->swd_vp->v_type == VREG) {
		crfree(sdp->swd_cred);
	}
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	}

	uvmexp.swpages -= sdp->swd_npages;

	if (swaplist_find(sdp->swd_vp, 1) == NULL)
		panic("swap_off: swapdev not in list");
	swaplist_trim();

	/*
	 * free all resources!
	 */
	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
	    EX_WAITOK);
	extent_destroy(sdp->swd_ex);
	/* free sdp->swd_path ? */
	free(sdp, M_VMSWAP, sizeof(*sdp));
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	int s, pageno, bn;

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((u_int64_t)bp->b_blkno) >> PAGE_SHIFT;
	sdp = swapdrum_getsdp(pageno);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	/* convert drum page number to block number on this swapdev. */
	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */
	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
	case VBLK:
		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		buf_replacevnode(bp, sdp->swd_vp);

		bp->b_blkno = bn;
		splx(s);
		VOP_STRATEGY(bp);
		return;
	case VREG:
		/* delegate to sw_reg_strategy function. */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	caddr_t		addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((u_int64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;
		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == -1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;
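
		/*
		 * Worked example of the math above (numbers illustrative):
		 * with swd_bsize = 64 KB and byteoff = 100 KB, VOP_BMAP()
		 * is asked for fs block 1; off = 100 KB % 64 KB = 36 KB
		 * into that block, and with nra = 2 read-ahead blocks
		 * sz = 3 * 64 KB - 36 KB = 156 KB, clipped to resid.
		 */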

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_bq       = NULL;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_proc     = bp->b_proc;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = NULLVP;
		nbp->vb_buf.b_vnbufs.le_next = NOLIST;
		LIST_INIT(&nbp->vb_buf.b_dep);

		/*
		 * set b_dirtyoff/end and b_validoff/end.   this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		/* patch it back to the vnx */
		nbp->vb_vnx = vnx;
		task_set(&nbp->vb_task, sw_reg_iodone_internal, nbp);

		s = splbio();
		if (vnx->vx_error != 0) {
			pool_put(&vndbuf_pool, nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* start I/O if we are not over our limit */
		bufq_queue(&sdp->swd_bufq, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		pool_put(&vndxfer_pool, vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev.
 */
void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*bp;

	/* XXX: recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = bufq_dequeue(&sdp->swd_bufq);
		if (bp == NULL)
			break;

		sdp->swd_active++;

		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;

		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 *
 * XXX:
 * We only put this onto a taskq here, because of the maxactive game since
 * it basically requires us to call back into VOP_STRATEGY() (where we must
 * be able to sleep) via sw_reg_start().
 */
void
sw_reg_iodone(struct buf *bp)
{
	struct vndbuf *vbp = (struct vndbuf *)bp;
	task_add(systq, &vbp->vb_task);
}

void
sw_reg_iodone_internal(void *xvbp)
{
	struct vndbuf *vbp = xvbp;
	struct vndxfer *vnx = vbp->vb_vnx;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev *sdp = vnx->vx_sdp;
	int resid, s;

	s = splbio();

	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	/* pass error upward */
	if (vbp->vb_buf.b_error)
		vnx->vx_error = vbp->vb_buf.b_error;

	/* disassociate this buffer from the vnode (if any). */
	if (vbp->vb_buf.b_vp != NULL) {
		brelvp(&vbp->vb_buf);
	}

	/* kill vbp structure */
	pool_put(&vndbuf_pool, vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			pool_put(&vndxfer_pool, vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			pool_put(&vndxfer_pool, vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}
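
/*
 * Example trace of the VX_BUSY/vx_pending protocol (illustrative):
 * sw_reg_strategy() submits three aux buffers, so vx_pending climbs
 * 1, 2, 3 while VX_BUSY is set.  Completions drop it 2, 1, 0; whoever
 * last observes (VX_BUSY clear && vx_pending == 0), either the final
 * sw_reg_iodone_internal() or the tail of sw_reg_strategy(), frees the
 * vndxfer and calls biodone() on the parent buffer exactly once.
 */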


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the tail queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm.swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots, boolean_t lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long	result;

	/*
	 * no swap devices configured yet?   definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */

ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN, 0,
			    EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
			    &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the tailq.
			 */
			TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			/* done!  return drum slot number */
			return (result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	return 0;		/* failed */
}
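
/*
 * Round-robin example (illustrative): with devices A and B at the same
 * priority, a successful allocation from A moves A to the tail of the
 * priority's tailq, so the next allocation tries B first, then A again,
 * spreading page slots across equal-priority devices.
 */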

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;

	sdp = swapdrum_getsdp(startslot);
	if (sdp != NULL) {
		/*
		 * we just keep track of how many pages have been marked bad
		 * in this device, to make everything add up in swap_off().
		 * we assume here that the range of slots will all be within
		 * one swap device.
		 */
		sdp->swd_npgbad += nslots;
	}
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;

	/*
	 * ignore attempts to free the "bad" slot.
	 */
	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.   must hold pri lock to do
	 * lookup and access the extent.
	 */
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
	    EX_MALLOCOK|EX_NOWAIT) != 0) {
		printf("warning: resource shortage: %d pages of swap lost\n",
		    nslots);
	}

	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef UVM_SWAP_ENCRYPT
	{
		int i;
		if (swap_encrypt_initialized) {
			/* Dereference keys */
			for (i = 0; i < nslots; i++)
				if (uvm_swap_needdecrypt(sdp, startslot + i)) {
					struct swap_key *key;

					key = SWD_KEY(sdp, startslot + i);
					if (key->refcount != 0)
						SWAP_KEY_PUT(sdp, key);
				}

			/* Mark range as not decrypt */
			uvm_swap_markdecrypt(sdp, startslot, nslots, 0);
		}
	}
#endif /* UVM_SWAP_ENCRYPT */
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 */
int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int	result;

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 */
int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int	result;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return VM_PAGER_ERROR;
	}

	/* this page is (about to be) no longer only in swap. */
	uvmexp.swpgonly--;

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	if (result != VM_PAGER_OK && result != VM_PAGER_PEND) {
		/* oops, the read failed so it really is still only in swap. */
		uvmexp.swpgonly++;
	}

	return (result);
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct buf *bp;
	vaddr_t kva;
	int	result, s, mapinflags, pflag, bounce = 0, i;
	boolean_t write, async;
	vaddr_t bouncekva;
	struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
#ifdef UVM_SWAP_ENCRYPT
	struct swapdev *sdp;
	int	encrypt = 0;
#endif

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/* convert starting drum slot to block number */
	startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);
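
	/*
	 * Worked example of the conversion above, assuming 4 KB pages
	 * and 512-byte disk blocks (DEV_BSIZE): drum slot 5 gives
	 * startblk = btodb(5 << 12) = 5 * 4096 / 512 = 40.
	 */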

	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).
	 */
	mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
	if (!async)
		mapinflags |= UVMPAGER_MAPIN_WAITOK;
	kva = uvm_pagermapin(pps, npages, mapinflags);
	if (kva == 0)
		return (VM_PAGER_AGAIN);

#ifdef UVM_SWAP_ENCRYPT
	if (write) {
		/*
		 * Check if we need to do swap encryption on old pages.
		 * Later we need a different scheme, that swap encrypts
		 * all pages of a process that had at least one page swap
		 * encrypted.  Then we might not need to copy all pages
		 * in the cluster, and avoid the memory overhead in
		 * swapping.
		 */
		if (uvm_doswapencrypt)
			encrypt = 1;
	}

	if (swap_encrypt_initialized || encrypt) {
		/*
		 * we need to know the swap device that we are swapping to/from
		 * to see if the pages need to be marked for decryption or
		 * actually need to be decrypted.
		 * XXX - does this information stay the same over the whole
		 * execution of this function?
		 */
		sdp = swapdrum_getsdp(startslot);
	}

	/*
	 * Check that we are dma capable for read (write always bounces
	 * through the swapencrypt anyway).
	 */
	if (write && encrypt) {
		bounce = 1;	/* bounce through swapencrypt always */
	} else {
#else
	{
#endif

		for (i = 0; i < npages; i++) {
			if (VM_PAGE_TO_PHYS(pps[i]) < dma_constraint.ucr_low ||
			    VM_PAGE_TO_PHYS(pps[i]) > dma_constraint.ucr_high) {
				bounce = 1;
				break;
			}
		}
	}

	if (bounce) {
		int swmapflags;

		/* We always need write access. */
		swmapflags = UVMPAGER_MAPIN_READ;
		if (!async)
			swmapflags |= UVMPAGER_MAPIN_WAITOK;

		if (!uvm_swap_allocpages(tpps, npages)) {
			uvm_pagermapout(kva, npages);
			return (VM_PAGER_AGAIN);
		}

		bouncekva = uvm_pagermapin(tpps, npages, swmapflags);
		if (bouncekva == 0) {
			uvm_pagermapout(kva, npages);
			uvm_swap_freepages(tpps, npages);
			return (VM_PAGER_AGAIN);
		}
	}
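
	/*
	 * Note on the bounce path above (summary, not normative): an i/o
	 * bounces when any target page lies outside the DMA-reachable
	 * range [dma_constraint.ucr_low, ucr_high], e.g. above 4 GB with
	 * a 32-bit DMA device; encrypting writes always bounce, so the
	 * ciphertext lands in the freshly allocated tpps pages while the
	 * original pages remain untouched.
	 */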

	/* encrypt to swap */
	if (write && bounce) {
		int i, opages;
		caddr_t src, dst;
		u_int64_t block;

		src = (caddr_t) kva;
		dst = (caddr_t) bouncekva;
		block = startblk;
		for (i = 0; i < npages; i++) {
#ifdef UVM_SWAP_ENCRYPT
			struct swap_key *key;

			if (encrypt) {
				key = SWD_KEY(sdp, startslot + i);
				SWAP_KEY_GET(sdp, key);	/* add reference */

				swap_encrypt(key, src, dst, block, PAGE_SIZE);
				block += btodb(PAGE_SIZE);
			} else {
#else
			{
#endif /* UVM_SWAP_ENCRYPT */
				memcpy(dst, src, PAGE_SIZE);
			}
			/* this just tells async callbacks to free */
			atomic_setbits_int(&tpps[i]->pg_flags, PQ_ENCRYPT);
			src += PAGE_SIZE;
			dst += PAGE_SIZE;
		}

		uvm_pagermapout(kva, npages);

		/* dispose of pages we don't use anymore */
		opages = npages;
		uvm_pager_dropcluster(NULL, NULL, pps, &opages,
		    PGO_PDFREECLUST);

		kva = bouncekva;
	}

	/*
	 * now allocate a buf for the i/o.
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	pflag = (async || curproc == uvm.pagedaemon_proc) ? PR_NOWAIT :
	    PR_WAITOK;
	bp = pool_get(&bufpool, pflag | PR_ZERO);

	/*
	 * if we failed to get a swapbuf, return "try again"
	 */
	if (bp == NULL) {
		if (write && bounce) {
#ifdef UVM_SWAP_ENCRYPT
			int i;

			/* swap encrypt needs cleanup */
			if (encrypt)
				for (i = 0; i < npages; i++)
					SWAP_KEY_PUT(sdp, SWD_KEY(sdp,
					    startslot + i));
#endif

			uvm_pagermapout(kva, npages);
			uvm_swap_freepages(tpps, npages);
		}
		return (VM_PAGER_AGAIN);
	}

	/*
	 * prevent ASYNC reads.
	 * uvm_swap_io is only called from uvm_swap_get, uvm_swap_get
	 * assumes that all gets are SYNCIO.  Just make sure here.
	 * XXXARTUBC - might not be true anymore.
	 */
	if (!write) {
		flags &= ~B_ASYNC;
		async = 0;
	}

	/*
	 * fill in the bp.   we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp->b_flags = B_BUSY | B_NOCACHE | B_RAW | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	if (bounce)
		bp->b_data = (caddr_t)bouncekva;
	else
		bp->b_data = (caddr_t)kva;
	bp->b_bq = NULL;
	bp->b_blkno = startblk;
	LIST_INIT(&bp->b_dep);
	s = splbio();
	bp->b_vp = NULL;
	buf_replacevnode(bp, swapdev_vp);
	splx(s);
	bp->b_bufsize = bp->b_bcount = (long)npages << PAGE_SHIFT;

	/*
	 * for pageouts we must set "dirtyoff" [NFS client code needs it].
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if (write) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages << PAGE_SHIFT;
#ifdef UVM_SWAP_ENCRYPT
		/* mark the pages in the drum for decryption */
		if (swap_encrypt_initialized)
			uvm_swap_markdecrypt(sdp, startslot, npages, encrypt);
#endif
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/* for async ops we must set up the iodone handler. */
	if (async) {
		bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
		    B_PDAEMON : 0);
		bp->b_iodone = uvm_aio_biodone;
	}

	/* now we start the I/O, and if async, return. */
	VOP_STRATEGY(bp);
	if (async)
		return (VM_PAGER_PEND);

	/* must be sync i/o.   wait for it to finish */
	(void) biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

	/* decrypt swap */
	if (!write && !(bp->b_flags & B_ERROR)) {
		int i;
		caddr_t data = (caddr_t)kva;
		caddr_t dst = (caddr_t)kva;
		u_int64_t block = startblk;

		if (bounce)
			data = (caddr_t)bouncekva;

		for (i = 0; i < npages; i++) {
#ifdef UVM_SWAP_ENCRYPT
			struct swap_key *key;

			/* Check if we need to decrypt */
			if (swap_encrypt_initialized &&
			    uvm_swap_needdecrypt(sdp, startslot + i)) {
				key = SWD_KEY(sdp, startslot + i);
				if (key->refcount == 0) {
					result = VM_PAGER_ERROR;
					break;
				}
				swap_decrypt(key, data, dst, block, PAGE_SIZE);
			} else if (bounce) {
#else
			if (bounce) {
#endif
				memcpy(dst, data, PAGE_SIZE);
			}
			data += PAGE_SIZE;
			dst += PAGE_SIZE;
			block += btodb(PAGE_SIZE);
		}
		if (bounce)
			uvm_pagermapout(bouncekva, npages);
	}
	/* kill the pager mapping */
	uvm_pagermapout(kva, npages);

	/* no longer needed; free after encryption/bouncing */
	if (!write && bounce)
		uvm_swap_freepages(tpps, npages);

	/* now dispose of the buf */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);

	if (write && bp->b_vp)
		vwakeup(bp->b_vp);
	pool_put(&bufpool, bp);
	splx(s);

	/* finally return. */
	return (result);
}

void
swapmount(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct vnode *vp;
	dev_t swap_dev = swdevt[0].sw_dev;
	char *nam;
	char path[MNAMELEN + 1];

	/*
	 * No locking here since we happen to know that we will just be called
	 * once before any other process has forked.
	 */
	if (swap_dev == NODEV)
		return;

#if defined(NFSCLIENT)
	if (swap_dev == NETDEV) {
		extern struct nfs_diskless nfs_diskless;

		snprintf(path, sizeof(path), "%s",
		    nfs_diskless.nd_swap.ndm_host);
		vp = nfs_diskless.sw_vp;
		goto gotit;
	} else
#endif
	if (bdevvp(swap_dev, &vp))
		return;

	/* Construct a potential path to swap */
	if ((nam = findblkname(major(swap_dev))))
		snprintf(path, sizeof(path), "/dev/%s%d%c", nam,
		    DISKUNIT(swap_dev), 'a' + DISKPART(swap_dev));
	else
		snprintf(path, sizeof(path), "blkdev0x%x",
		    swap_dev);

#if defined(NFSCLIENT)
gotit:
#endif
	sdp = malloc(sizeof(*sdp), M_VMSWAP, M_WAITOK|M_ZERO);
	spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK);

	sdp->swd_flags = SWF_FAKE;
	sdp->swd_dev = swap_dev;

	sdp->swd_pathlen = strlen(path) + 1;
	sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK | M_ZERO);
	strlcpy(sdp->swd_path, path, sdp->swd_pathlen);

	sdp->swd_vp = vp;

	swaplist_insert(sdp, spp, 0);

	if (swap_on(curproc, sdp)) {
		swaplist_find(vp, 1);
		swaplist_trim();
		vput(sdp->swd_vp);
		free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
		free(sdp, M_VMSWAP, sizeof(*sdp));
		return;
	}
}
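
/*
 * Path construction example (illustrative): for an sd(4) disk with
 * unit 0 and partition 1, findblkname() yields "sd" and the snprintf
 * above produces "/dev/sd0b", since 'a' + DISKPART(swap_dev) turns
 * partition 1 into 'b'.  If the major number has no name, a fallback
 * of the form "blkdev0x..." is used instead.
 */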

#ifdef HIBERNATE
int
uvm_hibswap(dev_t dev, u_long *sp, u_long *ep)
{
	struct swapdev *sdp, *swd = NULL;
	struct swappri *spp;
	struct extent_region *exr, *exrn;
	u_long start = 0, end = 0, size = 0;

	/* no swap devices configured yet? */
	if (uvmexp.nswapdev < 1 || dev != swdevt[0].sw_dev)
		return (1);

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_dev == dev)
				swd = sdp;
		}
	}

	if (swd == NULL || (swd->swd_flags & SWF_ENABLE) == 0)
		return (1);

	LIST_FOREACH(exr, &swd->swd_ex->ex_regions, er_link) {
		u_long gapstart, gapend, gapsize;

		gapstart = exr->er_end + 1;
		exrn = LIST_NEXT(exr, er_link);
		if (!exrn)
			break;
		gapend = exrn->er_start - 1;
		gapsize = gapend - gapstart;
		if (gapsize > size) {
			start = gapstart;
			end = gapend;
			size = gapsize;
		}
	}

	if (size) {
		*sp = start;
		*ep = end;
		return (0);
	}
	return (1);
}
#endif /* HIBERNATE */