1 /* $NetBSD: uvm_swap.c,v 1.203 2021/03/13 15:29:55 skrll Exp $ */ 2 3 /* 4 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp 29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.203 2021/03/13 15:29:55 skrll Exp $"); 34 35 #include "opt_uvmhist.h" 36 #include "opt_compat_netbsd.h" 37 #include "opt_ddb.h" 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/atomic.h> 42 #include <sys/buf.h> 43 #include <sys/bufq.h> 44 #include <sys/conf.h> 45 #include <sys/cprng.h> 46 #include <sys/proc.h> 47 #include <sys/namei.h> 48 #include <sys/disklabel.h> 49 #include <sys/errno.h> 50 #include <sys/kernel.h> 51 #include <sys/vnode.h> 52 #include <sys/file.h> 53 #include <sys/vmem.h> 54 #include <sys/blist.h> 55 #include <sys/mount.h> 56 #include <sys/pool.h> 57 #include <sys/kmem.h> 58 #include <sys/syscallargs.h> 59 #include <sys/swap.h> 60 #include <sys/kauth.h> 61 #include <sys/sysctl.h> 62 #include <sys/workqueue.h> 63 64 #include <uvm/uvm.h> 65 66 #include <miscfs/specfs/specdev.h> 67 68 #include <crypto/aes/aes.h> 69 #include <crypto/aes/aes_cbc.h> 70 71 /* 72 * uvm_swap.c: manage configuration and i/o to swap space. 73 */ 74 75 /* 76 * swap space is managed in the following way: 77 * 78 * each swap partition or file is described by a "swapdev" structure. 79 * each "swapdev" structure contains a "swapent" structure which contains 80 * information that is passed up to the user (via system calls). 81 * 82 * each swap partition is assigned a "priority" (int) which controls 83 * swap partition usage. 84 * 85 * the system maintains a global data structure describing all swap 86 * partitions/files. there is a sorted LIST of "swappri" structures 87 * which describe "swapdev"'s at that priority. this LIST is headed 88 * by the "swap_priority" global var. each "swappri" contains a 89 * TAILQ of "swapdev" structures at that priority. 
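 *
 * for illustration, the lookups below (swaplist_find, swapdrum_getsdp,
 * uvm_swap_alloc) all walk these two levels the same way:
 *
 *	LIST_FOREACH(spp, &swap_priority, spi_swappri)
 *		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)
 *			...examine sdp...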
90 * 91 * locking: 92 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl 93 * system call and prevents the swap priority list from changing 94 * while we are in the middle of a system call (e.g. SWAP_STATS). 95 * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data 96 * structures including the priority list, the swapdev structures, 97 * and the swapmap arena. 98 * 99 * each swap device has the following info: 100 * - swap device in use (could be disabled, preventing future use) 101 * - swap enabled (allows new allocations on swap) 102 * - map info in /dev/drum 103 * - vnode pointer 104 * for swap files only: 105 * - block size 106 * - max byte count in buffer 107 * - buffer 108 * 109 * userland controls and configures swap with the swapctl(2) system call. 110 * the sys_swapctl performs the following operations: 111 * [1] SWAP_NSWAP: returns the number of swap devices currently configured 112 * [2] SWAP_STATS: given a pointer to an array of swapent structures 113 * (passed in via "arg") of a size passed in via "misc" ... we load 114 * the current swap config into the array. The actual work is done 115 * in the uvm_swap_stats() function. 116 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a 117 * priority in "misc", start swapping on it. 118 * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device 119 * [5] SWAP_CTL: changes the priority of a swap device (new priority in 120 * "misc") 121 */ 122 123 /* 124 * swapdev: describes a single swap partition/file 125 * 126 * note the following should be true: 127 * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks] 128 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel] 129 */ 130 struct swapdev { 131 dev_t swd_dev; /* device id */ 132 int swd_flags; /* flags:inuse/enable/fake */ 133 int swd_priority; /* our priority */ 134 int swd_nblks; /* blocks in this device */ 135 char *swd_path; /* saved pathname of device */ 136 int swd_pathlen; /* length of pathname */ 137 int swd_npages; /* #pages we can use */ 138 int swd_npginuse; /* #pages in use */ 139 int swd_npgbad; /* #pages bad */ 140 int swd_drumoffset; /* page0 offset in drum */ 141 int swd_drumsize; /* #pages in drum */ 142 blist_t swd_blist; /* blist for this swapdev */ 143 struct vnode *swd_vp; /* backing vnode */ 144 TAILQ_ENTRY(swapdev) swd_next; /* priority tailq */ 145 146 int swd_bsize; /* blocksize (bytes) */ 147 int swd_maxactive; /* max active i/o reqs */ 148 struct bufq_state *swd_tab; /* buffer list */ 149 int swd_active; /* number of active buffers */ 150 151 volatile uint32_t *swd_encmap; /* bitmap of encrypted slots */ 152 struct aesenc swd_enckey; /* AES key expanded for enc */ 153 struct aesdec swd_deckey; /* AES key expanded for dec */ 154 bool swd_encinit; /* true if keys initialized */ 155 }; 156 157 /* 158 * swap device priority entry; the list is kept sorted on `spi_priority'. 159 */ 160 struct swappri { 161 int spi_priority; /* priority */ 162 TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev; 163 /* tailq of swapdevs at this priority */ 164 LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */ 165 }; 166 167 /* 168 * The following two structures are used to keep track of data transfers 169 * on swap devices associated with regular files. 170 * NOTE: this code is more or less a copy of vnd.c; we use the same 171 * structure names here to ease porting.. 
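 *
 * roughly: each /dev/drum i/o that lands on a file-backed swapdev gets
 * one vndxfer, and sw_reg_strategy() splits it into one vndbuf per
 * contiguous run of filesystem blocks found by VOP_BMAP(); the parent
 * buffer is completed in sw_reg_iodone() once vx_pending drops to zero.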
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
static struct pool vndxfer_pool, vndbuf_pool;

/*
 * local variables
 */
static vmem_t *swapmap;		/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static kmutex_t uvm_swap_data_lock __cacheline_aligned;
static krwlock_t swap_syscall_lock;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/* tuneables */
u_int uvm_swapisfull_factor = 99;
bool uvm_swap_encrypt = false;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

static void uvm_swap_genkey(struct swapdev *);
static void uvm_swap_encryptpage(struct swapdev *, void *, int);
static void uvm_swap_decryptpage(struct swapdev *, void *, int);

static size_t
encmap_size(size_t npages)
{
	struct swapdev *sdp;
	const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
	const size_t bitsperword = NBBY * bytesperword;
	const size_t nbits = npages;	/* one bit for each page */
	const size_t nwords = howmany(nbits, bitsperword);
	const size_t nbytes = nwords * bytesperword;

	return nbytes;
}

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *    are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC(__func__);

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("%s: can't get vnode for swap device", __func__);
	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
		panic("%s: can't lock swap device", __func__);
	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
		panic("%s: can't open swap device", __func__);
	VOP_UNLOCK(swapdev_vp);

	/*
	 * create swap block resource map to map /dev/drum. the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.
note 285 * that block 0 is reserved (used to indicate an allocation 286 * failure, or no allocation). 287 */ 288 swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0, 289 VM_NOSLEEP, IPL_NONE); 290 if (swapmap == 0) { 291 panic("%s: vmem_create failed", __func__); 292 } 293 294 pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 295 NULL, IPL_BIO); 296 pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 297 NULL, IPL_BIO); 298 299 UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0); 300 } 301 302 /* 303 * swaplist functions: functions that operate on the list of swap 304 * devices on the system. 305 */ 306 307 /* 308 * swaplist_insert: insert swap device "sdp" into the global list 309 * 310 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 311 * => caller must provide a newly allocated swappri structure (we will 312 * FREE it if we don't need it... this it to prevent allocation 313 * blocking here while adding swap) 314 */ 315 static void 316 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority) 317 { 318 struct swappri *spp, *pspp; 319 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 320 321 KASSERT(rw_write_held(&swap_syscall_lock)); 322 KASSERT(mutex_owned(&uvm_swap_data_lock)); 323 324 /* 325 * find entry at or after which to insert the new device. 326 */ 327 pspp = NULL; 328 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 329 if (priority <= spp->spi_priority) 330 break; 331 pspp = spp; 332 } 333 334 /* 335 * new priority? 336 */ 337 if (spp == NULL || spp->spi_priority != priority) { 338 spp = newspp; /* use newspp! */ 339 UVMHIST_LOG(pdhist, "created new swappri = %jd", 340 priority, 0, 0, 0); 341 342 spp->spi_priority = priority; 343 TAILQ_INIT(&spp->spi_swapdev); 344 345 if (pspp) 346 LIST_INSERT_AFTER(pspp, spp, spi_swappri); 347 else 348 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri); 349 } else { 350 /* we don't need a new priority structure, free it */ 351 kmem_free(newspp, sizeof(*newspp)); 352 } 353 354 /* 355 * priority found (or created). now insert on the priority's 356 * tailq list and bump the total number of swapdevs. 357 */ 358 sdp->swd_priority = priority; 359 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 360 uvmexp.nswapdev++; 361 } 362 363 /* 364 * swaplist_find: find and optionally remove a swap device from the 365 * global list. 366 * 367 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 368 * => we return the swapdev we found (and removed) 369 */ 370 static struct swapdev * 371 swaplist_find(struct vnode *vp, bool remove) 372 { 373 struct swapdev *sdp; 374 struct swappri *spp; 375 376 KASSERT(rw_lock_held(&swap_syscall_lock)); 377 KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1); 378 KASSERT(mutex_owned(&uvm_swap_data_lock)); 379 380 /* 381 * search the lists for the requested vp 382 */ 383 384 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 385 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 386 if (sdp->swd_vp == vp) { 387 if (remove) { 388 TAILQ_REMOVE(&spp->spi_swapdev, 389 sdp, swd_next); 390 uvmexp.nswapdev--; 391 } 392 return(sdp); 393 } 394 } 395 } 396 return (NULL); 397 } 398 399 /* 400 * swaplist_trim: scan priority list for empty priority entries and kill 401 * them. 
402 * 403 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 404 */ 405 static void 406 swaplist_trim(void) 407 { 408 struct swappri *spp, *nextspp; 409 410 KASSERT(rw_write_held(&swap_syscall_lock)); 411 KASSERT(mutex_owned(&uvm_swap_data_lock)); 412 413 LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) { 414 if (!TAILQ_EMPTY(&spp->spi_swapdev)) 415 continue; 416 LIST_REMOVE(spp, spi_swappri); 417 kmem_free(spp, sizeof(*spp)); 418 } 419 } 420 421 /* 422 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back 423 * to the "swapdev" that maps that section of the drum. 424 * 425 * => each swapdev takes one big contig chunk of the drum 426 * => caller must hold uvm_swap_data_lock 427 */ 428 static struct swapdev * 429 swapdrum_getsdp(int pgno) 430 { 431 struct swapdev *sdp; 432 struct swappri *spp; 433 434 KASSERT(mutex_owned(&uvm_swap_data_lock)); 435 436 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 437 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 438 if (sdp->swd_flags & SWF_FAKE) 439 continue; 440 if (pgno >= sdp->swd_drumoffset && 441 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { 442 return sdp; 443 } 444 } 445 } 446 return NULL; 447 } 448 449 /* 450 * swapdrum_sdp_is: true iff the swap device for pgno is sdp 451 * 452 * => for use in positive assertions only; result is not stable 453 */ 454 static bool __debugused 455 swapdrum_sdp_is(int pgno, struct swapdev *sdp) 456 { 457 bool result; 458 459 mutex_enter(&uvm_swap_data_lock); 460 result = swapdrum_getsdp(pgno) == sdp; 461 mutex_exit(&uvm_swap_data_lock); 462 463 return result; 464 } 465 466 void swapsys_lock(krw_t op) 467 { 468 rw_enter(&swap_syscall_lock, op); 469 } 470 471 void swapsys_unlock(void) 472 { 473 rw_exit(&swap_syscall_lock); 474 } 475 476 static void 477 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse) 478 { 479 se->se_dev = sdp->swd_dev; 480 se->se_flags = sdp->swd_flags; 481 se->se_nblks = sdp->swd_nblks; 482 se->se_inuse = inuse; 483 se->se_priority = sdp->swd_priority; 484 KASSERT(sdp->swd_pathlen < sizeof(se->se_path)); 485 strcpy(se->se_path, sdp->swd_path); 486 } 487 488 int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) = 489 (void *)enosys; 490 int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) = 491 (void *)enosys; 492 493 /* 494 * sys_swapctl: main entry point for swapctl(2) system call 495 * [with two helper functions: swap_on and swap_off] 496 */ 497 int 498 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval) 499 { 500 /* { 501 syscallarg(int) cmd; 502 syscallarg(void *) arg; 503 syscallarg(int) misc; 504 } */ 505 struct vnode *vp; 506 struct nameidata nd; 507 struct swappri *spp; 508 struct swapdev *sdp; 509 #define SWAP_PATH_MAX (PATH_MAX + 1) 510 char *userpath; 511 size_t len = 0; 512 int error; 513 int priority; 514 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 515 516 /* 517 * we handle the non-priv NSWAP and STATS request first. 
518 * 519 * SWAP_NSWAP: return number of config'd swap devices 520 * [can also be obtained with uvmexp sysctl] 521 */ 522 if (SCARG(uap, cmd) == SWAP_NSWAP) { 523 const int nswapdev = uvmexp.nswapdev; 524 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev, 525 0, 0, 0); 526 *retval = nswapdev; 527 return 0; 528 } 529 530 userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP); 531 532 /* 533 * ensure serialized syscall access by grabbing the swap_syscall_lock 534 */ 535 rw_enter(&swap_syscall_lock, RW_WRITER); 536 537 /* 538 * SWAP_STATS: get stats on current # of configured swap devs 539 * 540 * note that the swap_priority list can't change as long 541 * as we are holding the swap_syscall_lock. we don't want 542 * to grab the uvm_swap_data_lock because we may fault&sleep during 543 * copyout() and we don't want to be holding that lock then! 544 */ 545 switch (SCARG(uap, cmd)) { 546 case SWAP_STATS13: 547 error = (*uvm_swap_stats13)(uap, retval); 548 goto out; 549 case SWAP_STATS50: 550 error = (*uvm_swap_stats50)(uap, retval); 551 goto out; 552 case SWAP_STATS: 553 error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc), 554 NULL, sizeof(struct swapent), retval); 555 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0); 556 goto out; 557 558 case SWAP_GETDUMPDEV: 559 error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev)); 560 goto out; 561 default: 562 break; 563 } 564 565 /* 566 * all other requests require superuser privs. verify. 567 */ 568 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL, 569 0, NULL, NULL, NULL))) 570 goto out; 571 572 if (SCARG(uap, cmd) == SWAP_DUMPOFF) { 573 /* drop the current dump device */ 574 dumpdev = NODEV; 575 dumpcdev = NODEV; 576 cpu_dumpconf(); 577 goto out; 578 } 579 580 /* 581 * at this point we expect a path name in arg. we will 582 * use namei() to gain a vnode reference (vref), and lock 583 * the vnode (VOP_LOCK). 584 * 585 * XXX: a NULL arg means use the root vnode pointer (e.g. for 586 * miniroot) 587 */ 588 if (SCARG(uap, arg) == NULL) { 589 vp = rootvp; /* miniroot */ 590 vref(vp); 591 if (vn_lock(vp, LK_EXCLUSIVE)) { 592 vrele(vp); 593 error = EBUSY; 594 goto out; 595 } 596 if (SCARG(uap, cmd) == SWAP_ON && 597 copystr("miniroot", userpath, SWAP_PATH_MAX, &len)) 598 panic("swapctl: miniroot copy failed"); 599 } else { 600 struct pathbuf *pb; 601 602 /* 603 * This used to allow copying in one extra byte 604 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON. 605 * This was completely pointless because if anyone 606 * used that extra byte namei would fail with 607 * ENAMETOOLONG anyway, so I've removed the excess 608 * logic. 
- dholland 20100215 609 */ 610 611 error = pathbuf_copyin(SCARG(uap, arg), &pb); 612 if (error) { 613 goto out; 614 } 615 if (SCARG(uap, cmd) == SWAP_ON) { 616 /* get a copy of the string */ 617 pathbuf_copystring(pb, userpath, SWAP_PATH_MAX); 618 len = strlen(userpath) + 1; 619 } 620 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); 621 if ((error = namei(&nd))) { 622 pathbuf_destroy(pb); 623 goto out; 624 } 625 vp = nd.ni_vp; 626 pathbuf_destroy(pb); 627 } 628 /* note: "vp" is referenced and locked */ 629 630 error = 0; /* assume no error */ 631 switch(SCARG(uap, cmd)) { 632 633 case SWAP_DUMPDEV: 634 if (vp->v_type != VBLK) { 635 error = ENOTBLK; 636 break; 637 } 638 if (bdevsw_lookup(vp->v_rdev)) { 639 dumpdev = vp->v_rdev; 640 dumpcdev = devsw_blk2chr(dumpdev); 641 } else 642 dumpdev = NODEV; 643 cpu_dumpconf(); 644 break; 645 646 case SWAP_CTL: 647 /* 648 * get new priority, remove old entry (if any) and then 649 * reinsert it in the correct place. finally, prune out 650 * any empty priority structures. 651 */ 652 priority = SCARG(uap, misc); 653 spp = kmem_alloc(sizeof(*spp), KM_SLEEP); 654 mutex_enter(&uvm_swap_data_lock); 655 if ((sdp = swaplist_find(vp, true)) == NULL) { 656 error = ENOENT; 657 } else { 658 swaplist_insert(sdp, spp, priority); 659 swaplist_trim(); 660 } 661 mutex_exit(&uvm_swap_data_lock); 662 if (error) 663 kmem_free(spp, sizeof(*spp)); 664 break; 665 666 case SWAP_ON: 667 668 /* 669 * check for duplicates. if none found, then insert a 670 * dummy entry on the list to prevent someone else from 671 * trying to enable this device while we are working on 672 * it. 673 */ 674 675 priority = SCARG(uap, misc); 676 sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP); 677 spp = kmem_alloc(sizeof(*spp), KM_SLEEP); 678 sdp->swd_flags = SWF_FAKE; 679 sdp->swd_vp = vp; 680 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV; 681 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK); 682 mutex_enter(&uvm_swap_data_lock); 683 if (swaplist_find(vp, false) != NULL) { 684 error = EBUSY; 685 mutex_exit(&uvm_swap_data_lock); 686 bufq_free(sdp->swd_tab); 687 kmem_free(sdp, sizeof(*sdp)); 688 kmem_free(spp, sizeof(*spp)); 689 break; 690 } 691 swaplist_insert(sdp, spp, priority); 692 mutex_exit(&uvm_swap_data_lock); 693 694 KASSERT(len > 0); 695 sdp->swd_pathlen = len; 696 sdp->swd_path = kmem_alloc(len, KM_SLEEP); 697 if (copystr(userpath, sdp->swd_path, len, 0) != 0) 698 panic("swapctl: copystr"); 699 700 /* 701 * we've now got a FAKE placeholder in the swap list. 702 * now attempt to enable swap on it. if we fail, undo 703 * what we've done and kill the fake entry we just inserted. 704 * if swap_on is a success, it will clear the SWF_FAKE flag 705 */ 706 707 if ((error = swap_on(l, sdp)) != 0) { 708 mutex_enter(&uvm_swap_data_lock); 709 (void) swaplist_find(vp, true); /* kill fake entry */ 710 swaplist_trim(); 711 mutex_exit(&uvm_swap_data_lock); 712 bufq_free(sdp->swd_tab); 713 kmem_free(sdp->swd_path, sdp->swd_pathlen); 714 kmem_free(sdp, sizeof(*sdp)); 715 break; 716 } 717 break; 718 719 case SWAP_OFF: 720 mutex_enter(&uvm_swap_data_lock); 721 if ((sdp = swaplist_find(vp, false)) == NULL) { 722 mutex_exit(&uvm_swap_data_lock); 723 error = ENXIO; 724 break; 725 } 726 727 /* 728 * If a device isn't in use or enabled, we 729 * can't stop swapping from it (again). 730 */ 731 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { 732 mutex_exit(&uvm_swap_data_lock); 733 error = EBUSY; 734 break; 735 } 736 737 /* 738 * do the real work. 
739 */ 740 error = swap_off(l, sdp); 741 break; 742 743 default: 744 error = EINVAL; 745 } 746 747 /* 748 * done! release the ref gained by namei() and unlock. 749 */ 750 vput(vp); 751 out: 752 rw_exit(&swap_syscall_lock); 753 kmem_free(userpath, SWAP_PATH_MAX); 754 755 UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0); 756 return (error); 757 } 758 759 /* 760 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept 761 * away from sys_swapctl() in order to allow COMPAT_* swapctl() 762 * emulation to use it directly without going through sys_swapctl(). 763 * The problem with using sys_swapctl() there is that it involves 764 * copying the swapent array to the stackgap, and this array's size 765 * is not known at build time. Hence it would not be possible to 766 * ensure it would fit in the stackgap in any case. 767 */ 768 int 769 uvm_swap_stats(char *ptr, int misc, 770 void (*f)(void *, const struct swapent *), size_t len, 771 register_t *retval) 772 { 773 struct swappri *spp; 774 struct swapdev *sdp; 775 struct swapent sep; 776 int count = 0; 777 int error; 778 779 KASSERT(len <= sizeof(sep)); 780 if (len == 0) 781 return ENOSYS; 782 783 if (misc < 0) 784 return EINVAL; 785 786 if (misc == 0 || uvmexp.nswapdev == 0) 787 return 0; 788 789 /* Make sure userland cannot exhaust kernel memory */ 790 if ((size_t)misc > (size_t)uvmexp.nswapdev) 791 misc = uvmexp.nswapdev; 792 793 KASSERT(rw_lock_held(&swap_syscall_lock)); 794 795 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 796 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 797 int inuse; 798 799 if (misc-- <= 0) 800 break; 801 802 inuse = btodb((uint64_t)sdp->swd_npginuse << 803 PAGE_SHIFT); 804 805 memset(&sep, 0, sizeof(sep)); 806 swapent_cvt(&sep, sdp, inuse); 807 if (f) 808 (*f)(&sep, &sep); 809 if ((error = copyout(&sep, ptr, len)) != 0) 810 return error; 811 ptr += len; 812 count++; 813 } 814 } 815 *retval = count; 816 return 0; 817 } 818 819 /* 820 * swap_on: attempt to enable a swapdev for swapping. note that the 821 * swapdev is already on the global list, but disabled (marked 822 * SWF_FAKE). 823 * 824 * => we avoid the start of the disk (to protect disk labels) 825 * => we also avoid the miniroot, if we are swapping to root. 826 * => caller should leave uvm_swap_data_lock unlocked, we may lock it 827 * if needed. 828 */ 829 static int 830 swap_on(struct lwp *l, struct swapdev *sdp) 831 { 832 struct vnode *vp; 833 int error, npages, nblocks, size; 834 long addr; 835 vmem_addr_t result; 836 struct vattr va; 837 dev_t dev; 838 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 839 840 /* 841 * we want to enable swapping on sdp. the swd_vp contains 842 * the vnode we want (locked and ref'd), and the swd_dev 843 * contains the dev_t of the file, if it a block device. 844 */ 845 846 vp = sdp->swd_vp; 847 dev = sdp->swd_dev; 848 849 /* 850 * open the swap file (mostly useful for block device files to 851 * let device driver know what is up). 852 * 853 * we skip the open/close for root on swap because the root 854 * has already been opened when root was mounted (mountroot). 855 */ 856 if (vp != rootvp) { 857 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred))) 858 return (error); 859 } 860 861 /* XXX this only works for block devices */ 862 UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0); 863 864 /* 865 * we now need to determine the size of the swap area. for 866 * block specials we can call the d_psize function. 867 * for normal files, we must stat [get attrs]. 
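	 * (for example, a 1 GiB block-device partition reports
	 * 2097152 512-byte DEV_BSIZE blocks; the dbtob() conversion
	 * below turns that into 262144 pages, assuming 4 KiB pages.)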
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if ((nblocks = bdev_size(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time. take it easy on NFS servers.
		 */
		if (vp->v_tag == VT_NFS)
			sdp->swd_maxactive = 2; /* XXX */
		else
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area. we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, " dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * allocate space for swap encryption state and mark the
	 * keys uninitialized so we generate them lazily
	 */
	sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP);
	sdp->swd_encinit = false;

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it. do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks. so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type. there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
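		 * (the three shifts below add 1/32 + 1/64 + 1/128 of
		 * the estimate, i.e. roughly 5.5 percent.)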
977 */ 978 rootblocks += (rootblocks >> 5) + 979 (rootblocks >> 6) + 980 (rootblocks >> 7); 981 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT; 982 if (rootpages > size) 983 panic("swap_on: miniroot larger than swap?"); 984 985 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) { 986 panic("swap_on: unable to preserve miniroot"); 987 } 988 989 size -= rootpages; 990 printf("Preserved %d pages of miniroot ", rootpages); 991 printf("leaving %d pages of swap\n", size); 992 } 993 994 /* 995 * add a ref to vp to reflect usage as a swap device. 996 */ 997 vref(vp); 998 999 /* 1000 * now add the new swapdev to the drum and enable. 1001 */ 1002 error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result); 1003 if (error != 0) 1004 panic("swapdrum_add"); 1005 /* 1006 * If this is the first regular swap create the workqueue. 1007 * => Protected by swap_syscall_lock. 1008 */ 1009 if (vp->v_type != VBLK) { 1010 if (sw_reg_count++ == 0) { 1011 KASSERT(sw_reg_workqueue == NULL); 1012 if (workqueue_create(&sw_reg_workqueue, "swapiod", 1013 sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0) 1014 panic("%s: workqueue_create failed", __func__); 1015 } 1016 } 1017 1018 sdp->swd_drumoffset = (int)result; 1019 sdp->swd_drumsize = npages; 1020 sdp->swd_npages = size; 1021 mutex_enter(&uvm_swap_data_lock); 1022 sdp->swd_flags &= ~SWF_FAKE; /* going live */ 1023 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); 1024 uvmexp.swpages += size; 1025 uvmexp.swpgavail += size; 1026 mutex_exit(&uvm_swap_data_lock); 1027 return (0); 1028 1029 /* 1030 * failure: clean up and return error. 1031 */ 1032 1033 bad: 1034 if (sdp->swd_blist) { 1035 blist_destroy(sdp->swd_blist); 1036 } 1037 if (vp != rootvp) { 1038 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred); 1039 } 1040 return (error); 1041 } 1042 1043 /* 1044 * swap_off: stop swapping on swapdev 1045 * 1046 * => swap data should be locked, we will unlock. 1047 */ 1048 static int 1049 swap_off(struct lwp *l, struct swapdev *sdp) 1050 { 1051 int npages = sdp->swd_npages; 1052 int error = 0; 1053 1054 UVMHIST_FUNC(__func__); 1055 UVMHIST_CALLARGS(pdhist, " dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0); 1056 1057 KASSERT(rw_write_held(&swap_syscall_lock)); 1058 KASSERT(mutex_owned(&uvm_swap_data_lock)); 1059 1060 /* disable the swap area being removed */ 1061 sdp->swd_flags &= ~SWF_ENABLE; 1062 uvmexp.swpgavail -= npages; 1063 mutex_exit(&uvm_swap_data_lock); 1064 1065 /* 1066 * the idea is to find all the pages that are paged out to this 1067 * device, and page them all in. in uvm, swap-backed pageable 1068 * memory can take two forms: aobjs and anons. call the 1069 * swapoff hook for each subsystem to bring in pages. 1070 */ 1071 1072 if (uao_swap_off(sdp->swd_drumoffset, 1073 sdp->swd_drumoffset + sdp->swd_drumsize) || 1074 amap_swap_off(sdp->swd_drumoffset, 1075 sdp->swd_drumoffset + sdp->swd_drumsize)) { 1076 error = ENOMEM; 1077 } else if (sdp->swd_npginuse > sdp->swd_npgbad) { 1078 error = EBUSY; 1079 } 1080 1081 if (error) { 1082 mutex_enter(&uvm_swap_data_lock); 1083 sdp->swd_flags |= SWF_ENABLE; 1084 uvmexp.swpgavail += npages; 1085 mutex_exit(&uvm_swap_data_lock); 1086 1087 return error; 1088 } 1089 1090 /* 1091 * If this is the last regular swap destroy the workqueue. 1092 * => Protected by swap_syscall_lock. 
1093 */ 1094 if (sdp->swd_vp->v_type != VBLK) { 1095 KASSERT(sw_reg_count > 0); 1096 KASSERT(sw_reg_workqueue != NULL); 1097 if (--sw_reg_count == 0) { 1098 workqueue_destroy(sw_reg_workqueue); 1099 sw_reg_workqueue = NULL; 1100 } 1101 } 1102 1103 /* 1104 * done with the vnode. 1105 * drop our ref on the vnode before calling VOP_CLOSE() 1106 * so that spec_close() can tell if this is the last close. 1107 */ 1108 vrele(sdp->swd_vp); 1109 if (sdp->swd_vp != rootvp) { 1110 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred); 1111 } 1112 1113 mutex_enter(&uvm_swap_data_lock); 1114 uvmexp.swpages -= npages; 1115 uvmexp.swpginuse -= sdp->swd_npgbad; 1116 1117 if (swaplist_find(sdp->swd_vp, true) == NULL) 1118 panic("%s: swapdev not in list", __func__); 1119 swaplist_trim(); 1120 mutex_exit(&uvm_swap_data_lock); 1121 1122 /* 1123 * free all resources! 1124 */ 1125 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize); 1126 blist_destroy(sdp->swd_blist); 1127 bufq_free(sdp->swd_tab); 1128 kmem_free(__UNVOLATILE(sdp->swd_encmap), 1129 encmap_size(sdp->swd_drumsize)); 1130 explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey); 1131 explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey); 1132 kmem_free(sdp, sizeof(*sdp)); 1133 return (0); 1134 } 1135 1136 void 1137 uvm_swap_shutdown(struct lwp *l) 1138 { 1139 struct swapdev *sdp; 1140 struct swappri *spp; 1141 struct vnode *vp; 1142 int error; 1143 1144 printf("turning off swap..."); 1145 rw_enter(&swap_syscall_lock, RW_WRITER); 1146 mutex_enter(&uvm_swap_data_lock); 1147 again: 1148 LIST_FOREACH(spp, &swap_priority, spi_swappri) 1149 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1150 if (sdp->swd_flags & SWF_FAKE) 1151 continue; 1152 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) 1153 continue; 1154 #ifdef DEBUG 1155 printf("\nturning off swap on %s...", sdp->swd_path); 1156 #endif 1157 /* Have to lock and reference vnode for swap_off(). */ 1158 vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY); 1159 vref(vp); 1160 error = swap_off(l, sdp); 1161 vput(vp); 1162 mutex_enter(&uvm_swap_data_lock); 1163 if (error) { 1164 printf("stopping swap on %s failed " 1165 "with error %d\n", sdp->swd_path, error); 1166 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1167 uvmexp.nswapdev--; 1168 swaplist_trim(); 1169 } 1170 goto again; 1171 } 1172 printf(" done\n"); 1173 mutex_exit(&uvm_swap_data_lock); 1174 rw_exit(&swap_syscall_lock); 1175 } 1176 1177 1178 /* 1179 * /dev/drum interface and i/o functions 1180 */ 1181 1182 /* 1183 * swstrategy: perform I/O on the drum 1184 * 1185 * => we must map the i/o request from the drum to the correct swapdev. 1186 */ 1187 static void 1188 swstrategy(struct buf *bp) 1189 { 1190 struct swapdev *sdp; 1191 struct vnode *vp; 1192 int pageno, bn; 1193 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1194 1195 /* 1196 * convert block number to swapdev. note that swapdev can't 1197 * be yanked out from under us because we are holding resources 1198 * in it (i.e. the blocks we are doing I/O on). 1199 */ 1200 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; 1201 mutex_enter(&uvm_swap_data_lock); 1202 sdp = swapdrum_getsdp(pageno); 1203 mutex_exit(&uvm_swap_data_lock); 1204 if (sdp == NULL) { 1205 bp->b_error = EINVAL; 1206 bp->b_resid = bp->b_bcount; 1207 biodone(bp); 1208 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0); 1209 return; 1210 } 1211 1212 /* 1213 * convert drum page number to block number on this swapdev. 
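	 * (for example, with 4 KiB pages and 512-byte device blocks,
	 * drum page 1030 on a swapdev whose swd_drumoffset is 1024
	 * becomes page 6 of that device, i.e. bn = 48.)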
1214 */ 1215 1216 pageno -= sdp->swd_drumoffset; /* page # on swapdev */ 1217 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ 1218 1219 UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd", 1220 ((bp->b_flags & B_READ) == 0) ? 1 : 0, 1221 sdp->swd_drumoffset, bn, bp->b_bcount); 1222 1223 /* 1224 * for block devices we finish up here. 1225 * for regular files we have to do more work which we delegate 1226 * to sw_reg_strategy(). 1227 */ 1228 1229 vp = sdp->swd_vp; /* swapdev vnode pointer */ 1230 switch (vp->v_type) { 1231 default: 1232 panic("%s: vnode type 0x%x", __func__, vp->v_type); 1233 1234 case VBLK: 1235 1236 /* 1237 * must convert "bp" from an I/O on /dev/drum to an I/O 1238 * on the swapdev (sdp). 1239 */ 1240 bp->b_blkno = bn; /* swapdev block number */ 1241 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */ 1242 1243 /* 1244 * if we are doing a write, we have to redirect the i/o on 1245 * drum's v_numoutput counter to the swapdevs. 1246 */ 1247 if ((bp->b_flags & B_READ) == 0) { 1248 mutex_enter(bp->b_objlock); 1249 vwakeup(bp); /* kills one 'v_numoutput' on drum */ 1250 mutex_exit(bp->b_objlock); 1251 mutex_enter(vp->v_interlock); 1252 vp->v_numoutput++; /* put it on swapdev */ 1253 mutex_exit(vp->v_interlock); 1254 } 1255 1256 /* 1257 * finally plug in swapdev vnode and start I/O 1258 */ 1259 bp->b_vp = vp; 1260 bp->b_objlock = vp->v_interlock; 1261 VOP_STRATEGY(vp, bp); 1262 return; 1263 1264 case VREG: 1265 /* 1266 * delegate to sw_reg_strategy function. 1267 */ 1268 sw_reg_strategy(sdp, bp, bn); 1269 return; 1270 } 1271 /* NOTREACHED */ 1272 } 1273 1274 /* 1275 * swread: the read function for the drum (just a call to physio) 1276 */ 1277 /*ARGSUSED*/ 1278 static int 1279 swread(dev_t dev, struct uio *uio, int ioflag) 1280 { 1281 UVMHIST_FUNC(__func__); 1282 UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); 1283 1284 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio)); 1285 } 1286 1287 /* 1288 * swwrite: the write function for the drum (just a call to physio) 1289 */ 1290 /*ARGSUSED*/ 1291 static int 1292 swwrite(dev_t dev, struct uio *uio, int ioflag) 1293 { 1294 UVMHIST_FUNC(__func__); 1295 UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); 1296 1297 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio)); 1298 } 1299 1300 const struct bdevsw swap_bdevsw = { 1301 .d_open = nullopen, 1302 .d_close = nullclose, 1303 .d_strategy = swstrategy, 1304 .d_ioctl = noioctl, 1305 .d_dump = nodump, 1306 .d_psize = nosize, 1307 .d_discard = nodiscard, 1308 .d_flag = D_OTHER 1309 }; 1310 1311 const struct cdevsw swap_cdevsw = { 1312 .d_open = nullopen, 1313 .d_close = nullclose, 1314 .d_read = swread, 1315 .d_write = swwrite, 1316 .d_ioctl = noioctl, 1317 .d_stop = nostop, 1318 .d_tty = notty, 1319 .d_poll = nopoll, 1320 .d_mmap = nommap, 1321 .d_kqfilter = nokqfilter, 1322 .d_discard = nodiscard, 1323 .d_flag = D_OTHER, 1324 }; 1325 1326 /* 1327 * sw_reg_strategy: handle swap i/o to regular files 1328 */ 1329 static void 1330 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) 1331 { 1332 struct vnode *vp; 1333 struct vndxfer *vnx; 1334 daddr_t nbn; 1335 char *addr; 1336 off_t byteoff; 1337 int s, off, nra, error, sz, resid; 1338 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1339 1340 /* 1341 * allocate a vndxfer head for this transfer and point it to 1342 * our buffer. 
	 */
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((uint64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf *nbp;

		/*
		 * translate byteoffset into block number. return values:
		 *  vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing. Instead, it causes random
			 * memory errors. The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("%s: swap to sparse file", __func__);
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
		    "vp %#jx/%#jx offset %#jx/%#jx",
		    (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);

		/*
		 * now get a buf structure. note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
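		 * sw_reg_iodone() relies on that layout: it recovers the
		 * vndbuf from the completed child buffer with a plain
		 * cast, roughly
		 *
		 *	struct vndbuf *vbp = (struct vndbuf *)cbp;
		 *	struct vndxfer *vnx = vbp->vb_xfer;
		 *
		 * where cbp is the child's struct buf pointer.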
1418 */ 1419 nbp = pool_get(&vndbuf_pool, PR_WAITOK); 1420 buf_init(&nbp->vb_buf); 1421 nbp->vb_buf.b_flags = bp->b_flags; 1422 nbp->vb_buf.b_cflags = bp->b_cflags; 1423 nbp->vb_buf.b_oflags = bp->b_oflags; 1424 nbp->vb_buf.b_bcount = sz; 1425 nbp->vb_buf.b_bufsize = sz; 1426 nbp->vb_buf.b_error = 0; 1427 nbp->vb_buf.b_data = addr; 1428 nbp->vb_buf.b_lblkno = 0; 1429 nbp->vb_buf.b_blkno = nbn + btodb(off); 1430 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno; 1431 nbp->vb_buf.b_iodone = sw_reg_biodone; 1432 nbp->vb_buf.b_vp = vp; 1433 nbp->vb_buf.b_objlock = vp->v_interlock; 1434 if (vp->v_type == VBLK) { 1435 nbp->vb_buf.b_dev = vp->v_rdev; 1436 } 1437 1438 nbp->vb_xfer = vnx; /* patch it back in to vnx */ 1439 1440 /* 1441 * Just sort by block number 1442 */ 1443 s = splbio(); 1444 if (vnx->vx_error != 0) { 1445 buf_destroy(&nbp->vb_buf); 1446 pool_put(&vndbuf_pool, nbp); 1447 goto out; 1448 } 1449 vnx->vx_pending++; 1450 1451 /* sort it in and start I/O if we are not over our limit */ 1452 /* XXXAD locking */ 1453 bufq_put(sdp->swd_tab, &nbp->vb_buf); 1454 sw_reg_start(sdp); 1455 splx(s); 1456 1457 /* 1458 * advance to the next I/O 1459 */ 1460 byteoff += sz; 1461 addr += sz; 1462 } 1463 1464 s = splbio(); 1465 1466 out: /* Arrive here at splbio */ 1467 vnx->vx_flags &= ~VX_BUSY; 1468 if (vnx->vx_pending == 0) { 1469 error = vnx->vx_error; 1470 pool_put(&vndxfer_pool, vnx); 1471 bp->b_error = error; 1472 biodone(bp); 1473 } 1474 splx(s); 1475 } 1476 1477 /* 1478 * sw_reg_start: start an I/O request on the requested swapdev 1479 * 1480 * => reqs are sorted by b_rawblkno (above) 1481 */ 1482 static void 1483 sw_reg_start(struct swapdev *sdp) 1484 { 1485 struct buf *bp; 1486 struct vnode *vp; 1487 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1488 1489 /* recursion control */ 1490 if ((sdp->swd_flags & SWF_BUSY) != 0) 1491 return; 1492 1493 sdp->swd_flags |= SWF_BUSY; 1494 1495 while (sdp->swd_active < sdp->swd_maxactive) { 1496 bp = bufq_get(sdp->swd_tab); 1497 if (bp == NULL) 1498 break; 1499 sdp->swd_active++; 1500 1501 UVMHIST_LOG(pdhist, 1502 "sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %#jx", 1503 (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno, 1504 bp->b_bcount); 1505 vp = bp->b_vp; 1506 KASSERT(bp->b_objlock == vp->v_interlock); 1507 if ((bp->b_flags & B_READ) == 0) { 1508 mutex_enter(vp->v_interlock); 1509 vp->v_numoutput++; 1510 mutex_exit(vp->v_interlock); 1511 } 1512 VOP_STRATEGY(vp, bp); 1513 } 1514 sdp->swd_flags &= ~SWF_BUSY; 1515 } 1516 1517 /* 1518 * sw_reg_biodone: one of our i/o's has completed 1519 */ 1520 static void 1521 sw_reg_biodone(struct buf *bp) 1522 { 1523 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL); 1524 } 1525 1526 /* 1527 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup 1528 * 1529 * => note that we can recover the vndbuf struct by casting the buf ptr 1530 */ 1531 static void 1532 sw_reg_iodone(struct work *wk, void *dummy) 1533 { 1534 struct vndbuf *vbp = (void *)wk; 1535 struct vndxfer *vnx = vbp->vb_xfer; 1536 struct buf *pbp = vnx->vx_bp; /* parent buffer */ 1537 struct swapdev *sdp = vnx->vx_sdp; 1538 int s, resid, error; 1539 KASSERT(&vbp->vb_buf.b_work == wk); 1540 UVMHIST_FUNC(__func__); 1541 UVMHIST_CALLARGS(pdhist, " vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx", 1542 (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, 1543 (uintptr_t)vbp->vb_buf.b_data); 1544 UVMHIST_LOG(pdhist, " cnt=%#jx resid=%#jx", 1545 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0); 1546 1547 /* 1548 * protect vbp at 
splbio and update. 1549 */ 1550 1551 s = splbio(); 1552 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; 1553 pbp->b_resid -= resid; 1554 vnx->vx_pending--; 1555 1556 if (vbp->vb_buf.b_error != 0) { 1557 /* pass error upward */ 1558 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO; 1559 UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0); 1560 vnx->vx_error = error; 1561 } 1562 1563 /* 1564 * kill vbp structure 1565 */ 1566 buf_destroy(&vbp->vb_buf); 1567 pool_put(&vndbuf_pool, vbp); 1568 1569 /* 1570 * wrap up this transaction if it has run to completion or, in 1571 * case of an error, when all auxiliary buffers have returned. 1572 */ 1573 if (vnx->vx_error != 0) { 1574 /* pass error upward */ 1575 error = vnx->vx_error; 1576 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { 1577 pbp->b_error = error; 1578 biodone(pbp); 1579 pool_put(&vndxfer_pool, vnx); 1580 } 1581 } else if (pbp->b_resid == 0) { 1582 KASSERT(vnx->vx_pending == 0); 1583 if ((vnx->vx_flags & VX_BUSY) == 0) { 1584 UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !", 1585 (uintptr_t)pbp, vnx->vx_error, 0, 0); 1586 biodone(pbp); 1587 pool_put(&vndxfer_pool, vnx); 1588 } 1589 } 1590 1591 /* 1592 * done! start next swapdev I/O if one is pending 1593 */ 1594 sdp->swd_active--; 1595 sw_reg_start(sdp); 1596 splx(s); 1597 } 1598 1599 1600 /* 1601 * uvm_swap_alloc: allocate space on swap 1602 * 1603 * => allocation is done "round robin" down the priority list, as we 1604 * allocate in a priority we "rotate" the circle queue. 1605 * => space can be freed with uvm_swap_free 1606 * => we return the page slot number in /dev/drum (0 == invalid slot) 1607 * => we lock uvm_swap_data_lock 1608 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM 1609 */ 1610 int 1611 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok) 1612 { 1613 struct swapdev *sdp; 1614 struct swappri *spp; 1615 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1616 1617 /* 1618 * no swap devices configured yet? definite failure. 1619 */ 1620 if (uvmexp.nswapdev < 1) 1621 return 0; 1622 1623 /* 1624 * XXXJAK: BEGIN HACK 1625 * 1626 * blist_alloc() in subr_blist.c will panic if we try to allocate 1627 * too many slots. 1628 */ 1629 if (*nslots > BLIST_MAX_ALLOC) { 1630 if (__predict_false(lessok == false)) 1631 return 0; 1632 *nslots = BLIST_MAX_ALLOC; 1633 } 1634 /* XXXJAK: END HACK */ 1635 1636 /* 1637 * lock data lock, convert slots into blocks, and enter loop 1638 */ 1639 mutex_enter(&uvm_swap_data_lock); 1640 1641 ReTry: /* XXXMRG */ 1642 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 1643 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1644 uint64_t result; 1645 1646 /* if it's not enabled, then we can't swap from it */ 1647 if ((sdp->swd_flags & SWF_ENABLE) == 0) 1648 continue; 1649 if (sdp->swd_npginuse + *nslots > sdp->swd_npages) 1650 continue; 1651 result = blist_alloc(sdp->swd_blist, *nslots); 1652 if (result == BLIST_NONE) { 1653 continue; 1654 } 1655 KASSERT(result < sdp->swd_drumsize); 1656 1657 /* 1658 * successful allocation! now rotate the tailq. 1659 */ 1660 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1661 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 1662 sdp->swd_npginuse += *nslots; 1663 uvmexp.swpginuse += *nslots; 1664 mutex_exit(&uvm_swap_data_lock); 1665 /* done! return drum slot number */ 1666 UVMHIST_LOG(pdhist, 1667 "success! 
returning %jd slots starting at %jd", 1668 *nslots, result + sdp->swd_drumoffset, 0, 0); 1669 return (result + sdp->swd_drumoffset); 1670 } 1671 } 1672 1673 /* XXXMRG: BEGIN HACK */ 1674 if (*nslots > 1 && lessok) { 1675 *nslots = 1; 1676 /* XXXMRG: ugh! blist should support this for us */ 1677 goto ReTry; 1678 } 1679 /* XXXMRG: END HACK */ 1680 1681 mutex_exit(&uvm_swap_data_lock); 1682 return 0; 1683 } 1684 1685 /* 1686 * uvm_swapisfull: return true if most of available swap is allocated 1687 * and in use. we don't count some small portion as it may be inaccessible 1688 * to us at any given moment, for example if there is lock contention or if 1689 * pages are busy. 1690 */ 1691 bool 1692 uvm_swapisfull(void) 1693 { 1694 int swpgonly; 1695 bool rv; 1696 1697 if (uvmexp.swpages == 0) { 1698 return true; 1699 } 1700 1701 mutex_enter(&uvm_swap_data_lock); 1702 KASSERT(uvmexp.swpgonly <= uvmexp.swpages); 1703 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 / 1704 uvm_swapisfull_factor); 1705 rv = (swpgonly >= uvmexp.swpgavail); 1706 mutex_exit(&uvm_swap_data_lock); 1707 1708 return (rv); 1709 } 1710 1711 /* 1712 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors 1713 * 1714 * => we lock uvm_swap_data_lock 1715 */ 1716 void 1717 uvm_swap_markbad(int startslot, int nslots) 1718 { 1719 struct swapdev *sdp; 1720 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1721 1722 mutex_enter(&uvm_swap_data_lock); 1723 sdp = swapdrum_getsdp(startslot); 1724 KASSERT(sdp != NULL); 1725 1726 /* 1727 * we just keep track of how many pages have been marked bad 1728 * in this device, to make everything add up in swap_off(). 1729 * we assume here that the range of slots will all be within 1730 * one swap device. 1731 */ 1732 1733 KASSERT(uvmexp.swpgonly >= nslots); 1734 atomic_add_int(&uvmexp.swpgonly, -nslots); 1735 sdp->swd_npgbad += nslots; 1736 UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0); 1737 mutex_exit(&uvm_swap_data_lock); 1738 } 1739 1740 /* 1741 * uvm_swap_free: free swap slots 1742 * 1743 * => this can be all or part of an allocation made by uvm_swap_alloc 1744 * => we lock uvm_swap_data_lock 1745 */ 1746 void 1747 uvm_swap_free(int startslot, int nslots) 1748 { 1749 struct swapdev *sdp; 1750 UVMHIST_FUNC(__func__); 1751 UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots, 1752 startslot, 0, 0); 1753 1754 /* 1755 * ignore attempts to free the "bad" slot. 1756 */ 1757 1758 if (startslot == SWSLOT_BAD) { 1759 return; 1760 } 1761 1762 /* 1763 * convert drum slot offset back to sdp, free the blocks 1764 * in the extent, and return. must hold pri lock to do 1765 * lookup and access the extent. 1766 */ 1767 1768 mutex_enter(&uvm_swap_data_lock); 1769 sdp = swapdrum_getsdp(startslot); 1770 KASSERT(uvmexp.nswapdev >= 1); 1771 KASSERT(sdp != NULL); 1772 KASSERT(sdp->swd_npginuse >= nslots); 1773 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); 1774 sdp->swd_npginuse -= nslots; 1775 uvmexp.swpginuse -= nslots; 1776 mutex_exit(&uvm_swap_data_lock); 1777 } 1778 1779 /* 1780 * uvm_swap_put: put any number of pages into a contig place on swap 1781 * 1782 * => can be sync or async 1783 */ 1784 1785 int 1786 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) 1787 { 1788 int error; 1789 1790 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE | 1791 ((flags & PGO_SYNCIO) ? 
0 : B_ASYNC)); 1792 return error; 1793 } 1794 1795 /* 1796 * uvm_swap_get: get a single page from swap 1797 * 1798 * => usually a sync op (from fault) 1799 */ 1800 1801 int 1802 uvm_swap_get(struct vm_page *page, int swslot, int flags) 1803 { 1804 int error; 1805 1806 atomic_inc_uint(&uvmexp.nswget); 1807 KASSERT(flags & PGO_SYNCIO); 1808 if (swslot == SWSLOT_BAD) { 1809 return EIO; 1810 } 1811 1812 error = uvm_swap_io(&page, swslot, 1, B_READ | 1813 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1814 if (error == 0) { 1815 1816 /* 1817 * this page is no longer only in swap. 1818 */ 1819 1820 KASSERT(uvmexp.swpgonly > 0); 1821 atomic_dec_uint(&uvmexp.swpgonly); 1822 } 1823 return error; 1824 } 1825 1826 /* 1827 * uvm_swap_io: do an i/o operation to swap 1828 */ 1829 1830 static int 1831 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) 1832 { 1833 daddr_t startblk; 1834 struct buf *bp; 1835 vaddr_t kva; 1836 int error, mapinflags; 1837 bool write, async, swap_encrypt; 1838 UVMHIST_FUNC(__func__); 1839 UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx", 1840 startslot, npages, flags, 0); 1841 1842 write = (flags & B_READ) == 0; 1843 async = (flags & B_ASYNC) != 0; 1844 swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt); 1845 1846 /* 1847 * allocate a buf for the i/o. 1848 */ 1849 1850 KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async)); 1851 bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp); 1852 if (bp == NULL) { 1853 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM); 1854 return ENOMEM; 1855 } 1856 1857 /* 1858 * convert starting drum slot to block number 1859 */ 1860 1861 startblk = btodb((uint64_t)startslot << PAGE_SHIFT); 1862 1863 /* 1864 * first, map the pages into the kernel. 1865 */ 1866 1867 mapinflags = !write ? 1868 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ : 1869 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE; 1870 if (write && swap_encrypt) /* need to encrypt in-place */ 1871 mapinflags |= UVMPAGER_MAPIN_READ; 1872 kva = uvm_pagermapin(pps, npages, mapinflags); 1873 1874 /* 1875 * encrypt writes in place if requested 1876 */ 1877 1878 if (write) do { 1879 struct swapdev *sdp; 1880 int i; 1881 1882 /* 1883 * Get the swapdev so we can discriminate on the 1884 * encryption state. There may or may not be an 1885 * encryption key generated; we may or may not be asked 1886 * to encrypt swap. 1887 * 1888 * 1. NO KEY, NO ENCRYPTION: Nothing to do. 1889 * 1890 * 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt, 1891 * and mark the slots encrypted. 1892 * 1893 * 3. KEY, BUT NO ENCRYPTION: The slots may already be 1894 * marked encrypted from a past life. Mark them not 1895 * encrypted. 1896 * 1897 * 4. KEY, ENCRYPTION: Encrypt and mark the slots 1898 * encrypted. 
1899 */ 1900 mutex_enter(&uvm_swap_data_lock); 1901 sdp = swapdrum_getsdp(startslot); 1902 if (!sdp->swd_encinit) { 1903 if (!swap_encrypt) { 1904 mutex_exit(&uvm_swap_data_lock); 1905 break; 1906 } 1907 uvm_swap_genkey(sdp); 1908 } 1909 KASSERT(sdp->swd_encinit); 1910 mutex_exit(&uvm_swap_data_lock); 1911 1912 for (i = 0; i < npages; i++) { 1913 int s = startslot + i; 1914 KDASSERT(swapdrum_sdp_is(s, sdp)); 1915 KASSERT(s >= sdp->swd_drumoffset); 1916 s -= sdp->swd_drumoffset; 1917 KASSERT(s < sdp->swd_drumsize); 1918 1919 if (swap_encrypt) { 1920 uvm_swap_encryptpage(sdp, 1921 (void *)(kva + (vsize_t)i*PAGE_SIZE), s); 1922 atomic_or_32(&sdp->swd_encmap[s/32], 1923 __BIT(s%32)); 1924 } else { 1925 atomic_and_32(&sdp->swd_encmap[s/32], 1926 ~__BIT(s%32)); 1927 } 1928 } 1929 } while (0); 1930 1931 /* 1932 * fill in the bp/sbp. we currently route our i/o through 1933 * /dev/drum's vnode [swapdev_vp]. 1934 */ 1935 1936 bp->b_cflags = BC_BUSY | BC_NOCACHE; 1937 bp->b_flags = (flags & (B_READ|B_ASYNC)); 1938 bp->b_proc = &proc0; /* XXX */ 1939 bp->b_vnbufs.le_next = NOLIST; 1940 bp->b_data = (void *)kva; 1941 bp->b_blkno = startblk; 1942 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT; 1943 1944 /* 1945 * bump v_numoutput (counter of number of active outputs). 1946 */ 1947 1948 if (write) { 1949 mutex_enter(swapdev_vp->v_interlock); 1950 swapdev_vp->v_numoutput++; 1951 mutex_exit(swapdev_vp->v_interlock); 1952 } 1953 1954 /* 1955 * for async ops we must set up the iodone handler. 1956 */ 1957 1958 if (async) { 1959 bp->b_iodone = uvm_aio_aiodone; 1960 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0); 1961 if (curlwp == uvm.pagedaemon_lwp) 1962 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1963 else 1964 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 1965 } else { 1966 bp->b_iodone = NULL; 1967 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1968 } 1969 UVMHIST_LOG(pdhist, 1970 "about to start io: data = %#jx blkno = %#jx, bcount = %jd", 1971 (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0); 1972 1973 /* 1974 * now we start the I/O, and if async, return. 1975 */ 1976 1977 VOP_STRATEGY(swapdev_vp, bp); 1978 if (async) { 1979 /* 1980 * Reads are always synchronous; if this changes, we 1981 * need to add an asynchronous path for decryption. 1982 */ 1983 KASSERT(write); 1984 return 0; 1985 } 1986 1987 /* 1988 * must be sync i/o. wait for it to finish 1989 */ 1990 1991 error = biowait(bp); 1992 if (error) 1993 goto out; 1994 1995 /* 1996 * decrypt reads in place if needed 1997 */ 1998 1999 if (!write) do { 2000 struct swapdev *sdp; 2001 bool encinit; 2002 int i; 2003 2004 /* 2005 * Get the sdp. Everything about it except the encinit 2006 * bit, saying whether the encryption key is 2007 * initialized or not, and the encrypted bit for each 2008 * page, is stable until all swap pages have been 2009 * released and the device is removed. 2010 */ 2011 mutex_enter(&uvm_swap_data_lock); 2012 sdp = swapdrum_getsdp(startslot); 2013 encinit = sdp->swd_encinit; 2014 mutex_exit(&uvm_swap_data_lock); 2015 2016 if (!encinit) 2017 /* 2018 * If there's no encryption key, there's no way 2019 * any of these slots can be encrypted, so 2020 * nothing to do here. 
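			 * (per-slot state is one bit in swd_encmap:
			 * slot s lives in word s/32, bit s%32, so
			 * e.g. slot 70 is bit 6 of word 2.)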
2021 */ 2022 break; 2023 for (i = 0; i < npages; i++) { 2024 int s = startslot + i; 2025 KDASSERT(swapdrum_sdp_is(s, sdp)); 2026 KASSERT(s >= sdp->swd_drumoffset); 2027 s -= sdp->swd_drumoffset; 2028 KASSERT(s < sdp->swd_drumsize); 2029 if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) & 2030 __BIT(s%32)) == 0) 2031 continue; 2032 uvm_swap_decryptpage(sdp, 2033 (void *)(kva + (vsize_t)i*PAGE_SIZE), s); 2034 } 2035 } while (0); 2036 out: 2037 /* 2038 * kill the pager mapping 2039 */ 2040 2041 uvm_pagermapout(kva, npages); 2042 2043 /* 2044 * now dispose of the buf and we're done. 2045 */ 2046 2047 if (write) { 2048 mutex_enter(swapdev_vp->v_interlock); 2049 vwakeup(bp); 2050 mutex_exit(swapdev_vp->v_interlock); 2051 } 2052 putiobuf(bp); 2053 UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0); 2054 2055 return (error); 2056 } 2057 2058 /* 2059 * uvm_swap_genkey(sdp) 2060 * 2061 * Generate a key for swap encryption. 2062 */ 2063 static void 2064 uvm_swap_genkey(struct swapdev *sdp) 2065 { 2066 uint8_t key[32]; 2067 2068 KASSERT(!sdp->swd_encinit); 2069 2070 cprng_strong(kern_cprng, key, sizeof key, 0); 2071 aes_setenckey256(&sdp->swd_enckey, key); 2072 aes_setdeckey256(&sdp->swd_deckey, key); 2073 explicit_memset(key, 0, sizeof key); 2074 2075 sdp->swd_encinit = true; 2076 } 2077 2078 /* 2079 * uvm_swap_encryptpage(sdp, kva, slot) 2080 * 2081 * Encrypt one page of data at kva for the specified slot number 2082 * in the swap device. 2083 */ 2084 static void 2085 uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot) 2086 { 2087 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16); 2088 2089 /* iv := AES_k(le32enc(slot) || 0^96) */ 2090 le32enc(preiv, slot); 2091 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS); 2092 2093 /* *kva := AES-CBC_k(iv, *kva) */ 2094 aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv, 2095 AES_256_NROUNDS); 2096 2097 explicit_memset(&iv, 0, sizeof iv); 2098 } 2099 2100 /* 2101 * uvm_swap_decryptpage(sdp, kva, slot) 2102 * 2103 * Decrypt one page of data at kva for the specified slot number 2104 * in the swap device. 2105 */ 2106 static void 2107 uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot) 2108 { 2109 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16); 2110 2111 /* iv := AES_k(le32enc(slot) || 0^96) */ 2112 le32enc(preiv, slot); 2113 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS); 2114 2115 /* *kva := AES-CBC^{-1}_k(iv, *kva) */ 2116 aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv, 2117 AES_256_NROUNDS); 2118 2119 explicit_memset(&iv, 0, sizeof iv); 2120 } 2121 2122 SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup") 2123 { 2124 2125 sysctl_createv(clog, 0, NULL, NULL, 2126 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt", 2127 SYSCTL_DESCR("Encrypt data when swapped out to disk"), 2128 NULL, 0, &uvm_swap_encrypt, 0, 2129 CTL_VM, CTL_CREATE, CTL_EOL); 2130 } 2131
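
/*
 * Illustrative userland sketch: the vm.swap_encrypt sysctl created above
 * and the SWAP_NSWAP/SWAP_STATS commands handled by sys_swapctl() can be
 * driven roughly like this (error handling omitted):
 *
 *	bool on = true;
 *	sysctlbyname("vm.swap_encrypt", NULL, NULL, &on, sizeof(on));
 *
 *	int n = swapctl(SWAP_NSWAP, NULL, 0);
 *	struct swapent *sep = calloc(n, sizeof(*sep));
 *	if (swapctl(SWAP_STATS, sep, n) == n)
 *		... use sep[i].se_inuse, sep[i].se_nblks, sep[i].se_path ...
 */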