1 /* $NetBSD: uvm_swap.c,v 1.204 2021/05/23 00:36:36 mrg Exp $ */ 2 3 /* 4 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp 29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.204 2021/05/23 00:36:36 mrg Exp $"); 34 35 #include "opt_uvmhist.h" 36 #include "opt_compat_netbsd.h" 37 #include "opt_ddb.h" 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/atomic.h> 42 #include <sys/buf.h> 43 #include <sys/bufq.h> 44 #include <sys/conf.h> 45 #include <sys/cprng.h> 46 #include <sys/proc.h> 47 #include <sys/namei.h> 48 #include <sys/disklabel.h> 49 #include <sys/errno.h> 50 #include <sys/kernel.h> 51 #include <sys/vnode.h> 52 #include <sys/file.h> 53 #include <sys/vmem.h> 54 #include <sys/blist.h> 55 #include <sys/mount.h> 56 #include <sys/pool.h> 57 #include <sys/kmem.h> 58 #include <sys/syscallargs.h> 59 #include <sys/swap.h> 60 #include <sys/kauth.h> 61 #include <sys/sysctl.h> 62 #include <sys/workqueue.h> 63 64 #include <uvm/uvm.h> 65 66 #include <miscfs/specfs/specdev.h> 67 68 #include <crypto/aes/aes.h> 69 #include <crypto/aes/aes_cbc.h> 70 71 /* 72 * uvm_swap.c: manage configuration and i/o to swap space. 73 */ 74 75 /* 76 * swap space is managed in the following way: 77 * 78 * each swap partition or file is described by a "swapdev" structure. 79 * each "swapdev" structure contains a "swapent" structure which contains 80 * information that is passed up to the user (via system calls). 81 * 82 * each swap partition is assigned a "priority" (int) which controls 83 * swap partition usage. 84 * 85 * the system maintains a global data structure describing all swap 86 * partitions/files. there is a sorted LIST of "swappri" structures 87 * which describe "swapdev"'s at that priority. this LIST is headed 88 * by the "swap_priority" global var. each "swappri" contains a 89 * TAILQ of "swapdev" structures at that priority. 90 * 91 * locking: 92 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl 93 * system call and prevents the swap priority list from changing 94 * while we are in the middle of a system call (e.g. 
SWAP_STATS).
 *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl() function performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.  The actual work is done
 *	in the uvm_swap_stats() function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 *    swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 *    swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	dev_t			swd_dev;	/* device id */
	int			swd_flags;	/* flags:inuse/enable/fake */
	int			swd_priority;	/* our priority */
	int			swd_nblks;	/* blocks in this device */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */

	volatile uint32_t	*swd_encmap;	/* bitmap of encrypted slots */
	struct aesenc		swd_enckey;	/* AES key expanded for enc */
	struct aesdec		swd_deckey;	/* AES key expanded for dec */
	bool			swd_encinit;	/* true if keys initialized */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
				/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
static struct pool vndxfer_pool, vndbuf_pool;

/*
 * local variables
 */
static vmem_t *swapmap;		/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static kmutex_t uvm_swap_data_lock __cacheline_aligned;
static krwlock_t swap_syscall_lock;
bool uvm_swap_init_done = false;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/* tuneables */
u_int uvm_swapisfull_factor = 99;
bool uvm_swap_encrypt = false;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

static void uvm_swap_genkey(struct swapdev *);
static void uvm_swap_encryptpage(struct swapdev *, void *, int);
static void uvm_swap_decryptpage(struct swapdev *, void *, int);

static size_t
encmap_size(size_t npages)
{
	struct swapdev *sdp;
	const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
	const size_t bitsperword = NBBY * bytesperword;
	const size_t nbits = npages;	/* one bit for each page */
	const size_t nwords = howmany(nbits, bitsperword);
	const size_t nbytes = nwords * bytesperword;

	return nbytes;
}

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC(__func__);

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("%s: can't get vnode for swap device", __func__);
	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
		panic("%s: can't lock swap device", __func__);
	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
		panic("%s: can't open swap device", __func__);
	VOP_UNLOCK(swapdev_vp);

	/*
	 * create swap block resource map to map /dev/drum.  the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.
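	 * (for example, with 4 KB pages that is roughly 8 TB of drum
	 * address space; the actual page size is machine-dependent.)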
	 * note that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
	    VM_NOSLEEP, IPL_NONE);
	if (swapmap == 0) {
		panic("%s: vmem_create failed", __func__);
	}

	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
	    NULL, IPL_BIO);
	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
	    NULL, IPL_BIO);

	uvm_swap_init_done = true;

	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly allocated swappri structure (we will
 *	FREE it if we don't need it... this is to prevent allocation
 *	blocking here while adding swap)
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	KASSERT(rw_write_held(&swap_syscall_lock));
	KASSERT(mutex_owned(&uvm_swap_data_lock));

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %jd",
		    priority, 0, 0, 0);

		spp->spi_priority = priority;
		TAILQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		kmem_free(newspp, sizeof(*newspp));
	}

	/*
	 * priority found (or created).   now insert on the priority's
	 * tailq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	KASSERT(rw_lock_held(&swap_syscall_lock));
	KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1);
	KASSERT(mutex_owned(&uvm_swap_data_lock));

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					TAILQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
		}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
405 * 406 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 407 */ 408 static void 409 swaplist_trim(void) 410 { 411 struct swappri *spp, *nextspp; 412 413 KASSERT(rw_write_held(&swap_syscall_lock)); 414 KASSERT(mutex_owned(&uvm_swap_data_lock)); 415 416 LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) { 417 if (!TAILQ_EMPTY(&spp->spi_swapdev)) 418 continue; 419 LIST_REMOVE(spp, spi_swappri); 420 kmem_free(spp, sizeof(*spp)); 421 } 422 } 423 424 /* 425 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back 426 * to the "swapdev" that maps that section of the drum. 427 * 428 * => each swapdev takes one big contig chunk of the drum 429 * => caller must hold uvm_swap_data_lock 430 */ 431 static struct swapdev * 432 swapdrum_getsdp(int pgno) 433 { 434 struct swapdev *sdp; 435 struct swappri *spp; 436 437 KASSERT(mutex_owned(&uvm_swap_data_lock)); 438 439 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 440 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 441 if (sdp->swd_flags & SWF_FAKE) 442 continue; 443 if (pgno >= sdp->swd_drumoffset && 444 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { 445 return sdp; 446 } 447 } 448 } 449 return NULL; 450 } 451 452 /* 453 * swapdrum_sdp_is: true iff the swap device for pgno is sdp 454 * 455 * => for use in positive assertions only; result is not stable 456 */ 457 static bool __debugused 458 swapdrum_sdp_is(int pgno, struct swapdev *sdp) 459 { 460 bool result; 461 462 mutex_enter(&uvm_swap_data_lock); 463 result = swapdrum_getsdp(pgno) == sdp; 464 mutex_exit(&uvm_swap_data_lock); 465 466 return result; 467 } 468 469 void swapsys_lock(krw_t op) 470 { 471 rw_enter(&swap_syscall_lock, op); 472 } 473 474 void swapsys_unlock(void) 475 { 476 rw_exit(&swap_syscall_lock); 477 } 478 479 static void 480 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse) 481 { 482 se->se_dev = sdp->swd_dev; 483 se->se_flags = sdp->swd_flags; 484 se->se_nblks = sdp->swd_nblks; 485 se->se_inuse = inuse; 486 se->se_priority = sdp->swd_priority; 487 KASSERT(sdp->swd_pathlen < sizeof(se->se_path)); 488 strcpy(se->se_path, sdp->swd_path); 489 } 490 491 int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) = 492 (void *)enosys; 493 int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) = 494 (void *)enosys; 495 496 /* 497 * sys_swapctl: main entry point for swapctl(2) system call 498 * [with two helper functions: swap_on and swap_off] 499 */ 500 int 501 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval) 502 { 503 /* { 504 syscallarg(int) cmd; 505 syscallarg(void *) arg; 506 syscallarg(int) misc; 507 } */ 508 struct vnode *vp; 509 struct nameidata nd; 510 struct swappri *spp; 511 struct swapdev *sdp; 512 #define SWAP_PATH_MAX (PATH_MAX + 1) 513 char *userpath; 514 size_t len = 0; 515 int error; 516 int priority; 517 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 518 519 /* 520 * we handle the non-priv NSWAP and STATS request first. 
521 * 522 * SWAP_NSWAP: return number of config'd swap devices 523 * [can also be obtained with uvmexp sysctl] 524 */ 525 if (SCARG(uap, cmd) == SWAP_NSWAP) { 526 const int nswapdev = uvmexp.nswapdev; 527 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev, 528 0, 0, 0); 529 *retval = nswapdev; 530 return 0; 531 } 532 533 userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP); 534 535 /* 536 * ensure serialized syscall access by grabbing the swap_syscall_lock 537 */ 538 rw_enter(&swap_syscall_lock, RW_WRITER); 539 540 /* 541 * SWAP_STATS: get stats on current # of configured swap devs 542 * 543 * note that the swap_priority list can't change as long 544 * as we are holding the swap_syscall_lock. we don't want 545 * to grab the uvm_swap_data_lock because we may fault&sleep during 546 * copyout() and we don't want to be holding that lock then! 547 */ 548 switch (SCARG(uap, cmd)) { 549 case SWAP_STATS13: 550 error = (*uvm_swap_stats13)(uap, retval); 551 goto out; 552 case SWAP_STATS50: 553 error = (*uvm_swap_stats50)(uap, retval); 554 goto out; 555 case SWAP_STATS: 556 error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc), 557 NULL, sizeof(struct swapent), retval); 558 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0); 559 goto out; 560 561 case SWAP_GETDUMPDEV: 562 error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev)); 563 goto out; 564 default: 565 break; 566 } 567 568 /* 569 * all other requests require superuser privs. verify. 570 */ 571 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL, 572 0, NULL, NULL, NULL))) 573 goto out; 574 575 if (SCARG(uap, cmd) == SWAP_DUMPOFF) { 576 /* drop the current dump device */ 577 dumpdev = NODEV; 578 dumpcdev = NODEV; 579 cpu_dumpconf(); 580 goto out; 581 } 582 583 /* 584 * at this point we expect a path name in arg. we will 585 * use namei() to gain a vnode reference (vref), and lock 586 * the vnode (VOP_LOCK). 587 * 588 * XXX: a NULL arg means use the root vnode pointer (e.g. for 589 * miniroot) 590 */ 591 if (SCARG(uap, arg) == NULL) { 592 vp = rootvp; /* miniroot */ 593 vref(vp); 594 if (vn_lock(vp, LK_EXCLUSIVE)) { 595 vrele(vp); 596 error = EBUSY; 597 goto out; 598 } 599 if (SCARG(uap, cmd) == SWAP_ON && 600 copystr("miniroot", userpath, SWAP_PATH_MAX, &len)) 601 panic("swapctl: miniroot copy failed"); 602 } else { 603 struct pathbuf *pb; 604 605 /* 606 * This used to allow copying in one extra byte 607 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON. 608 * This was completely pointless because if anyone 609 * used that extra byte namei would fail with 610 * ENAMETOOLONG anyway, so I've removed the excess 611 * logic. 
- dholland 20100215 612 */ 613 614 error = pathbuf_copyin(SCARG(uap, arg), &pb); 615 if (error) { 616 goto out; 617 } 618 if (SCARG(uap, cmd) == SWAP_ON) { 619 /* get a copy of the string */ 620 pathbuf_copystring(pb, userpath, SWAP_PATH_MAX); 621 len = strlen(userpath) + 1; 622 } 623 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); 624 if ((error = namei(&nd))) { 625 pathbuf_destroy(pb); 626 goto out; 627 } 628 vp = nd.ni_vp; 629 pathbuf_destroy(pb); 630 } 631 /* note: "vp" is referenced and locked */ 632 633 error = 0; /* assume no error */ 634 switch(SCARG(uap, cmd)) { 635 636 case SWAP_DUMPDEV: 637 if (vp->v_type != VBLK) { 638 error = ENOTBLK; 639 break; 640 } 641 if (bdevsw_lookup(vp->v_rdev)) { 642 dumpdev = vp->v_rdev; 643 dumpcdev = devsw_blk2chr(dumpdev); 644 } else 645 dumpdev = NODEV; 646 cpu_dumpconf(); 647 break; 648 649 case SWAP_CTL: 650 /* 651 * get new priority, remove old entry (if any) and then 652 * reinsert it in the correct place. finally, prune out 653 * any empty priority structures. 654 */ 655 priority = SCARG(uap, misc); 656 spp = kmem_alloc(sizeof(*spp), KM_SLEEP); 657 mutex_enter(&uvm_swap_data_lock); 658 if ((sdp = swaplist_find(vp, true)) == NULL) { 659 error = ENOENT; 660 } else { 661 swaplist_insert(sdp, spp, priority); 662 swaplist_trim(); 663 } 664 mutex_exit(&uvm_swap_data_lock); 665 if (error) 666 kmem_free(spp, sizeof(*spp)); 667 break; 668 669 case SWAP_ON: 670 671 /* 672 * check for duplicates. if none found, then insert a 673 * dummy entry on the list to prevent someone else from 674 * trying to enable this device while we are working on 675 * it. 676 */ 677 678 priority = SCARG(uap, misc); 679 sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP); 680 spp = kmem_alloc(sizeof(*spp), KM_SLEEP); 681 sdp->swd_flags = SWF_FAKE; 682 sdp->swd_vp = vp; 683 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV; 684 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK); 685 mutex_enter(&uvm_swap_data_lock); 686 if (swaplist_find(vp, false) != NULL) { 687 error = EBUSY; 688 mutex_exit(&uvm_swap_data_lock); 689 bufq_free(sdp->swd_tab); 690 kmem_free(sdp, sizeof(*sdp)); 691 kmem_free(spp, sizeof(*spp)); 692 break; 693 } 694 swaplist_insert(sdp, spp, priority); 695 mutex_exit(&uvm_swap_data_lock); 696 697 KASSERT(len > 0); 698 sdp->swd_pathlen = len; 699 sdp->swd_path = kmem_alloc(len, KM_SLEEP); 700 if (copystr(userpath, sdp->swd_path, len, 0) != 0) 701 panic("swapctl: copystr"); 702 703 /* 704 * we've now got a FAKE placeholder in the swap list. 705 * now attempt to enable swap on it. if we fail, undo 706 * what we've done and kill the fake entry we just inserted. 707 * if swap_on is a success, it will clear the SWF_FAKE flag 708 */ 709 710 if ((error = swap_on(l, sdp)) != 0) { 711 mutex_enter(&uvm_swap_data_lock); 712 (void) swaplist_find(vp, true); /* kill fake entry */ 713 swaplist_trim(); 714 mutex_exit(&uvm_swap_data_lock); 715 bufq_free(sdp->swd_tab); 716 kmem_free(sdp->swd_path, sdp->swd_pathlen); 717 kmem_free(sdp, sizeof(*sdp)); 718 break; 719 } 720 break; 721 722 case SWAP_OFF: 723 mutex_enter(&uvm_swap_data_lock); 724 if ((sdp = swaplist_find(vp, false)) == NULL) { 725 mutex_exit(&uvm_swap_data_lock); 726 error = ENXIO; 727 break; 728 } 729 730 /* 731 * If a device isn't in use or enabled, we 732 * can't stop swapping from it (again). 733 */ 734 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { 735 mutex_exit(&uvm_swap_data_lock); 736 error = EBUSY; 737 break; 738 } 739 740 /* 741 * do the real work. 
742 */ 743 error = swap_off(l, sdp); 744 break; 745 746 default: 747 error = EINVAL; 748 } 749 750 /* 751 * done! release the ref gained by namei() and unlock. 752 */ 753 vput(vp); 754 out: 755 rw_exit(&swap_syscall_lock); 756 kmem_free(userpath, SWAP_PATH_MAX); 757 758 UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0); 759 return (error); 760 } 761 762 /* 763 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept 764 * away from sys_swapctl() in order to allow COMPAT_* swapctl() 765 * emulation to use it directly without going through sys_swapctl(). 766 * The problem with using sys_swapctl() there is that it involves 767 * copying the swapent array to the stackgap, and this array's size 768 * is not known at build time. Hence it would not be possible to 769 * ensure it would fit in the stackgap in any case. 770 */ 771 int 772 uvm_swap_stats(char *ptr, int misc, 773 void (*f)(void *, const struct swapent *), size_t len, 774 register_t *retval) 775 { 776 struct swappri *spp; 777 struct swapdev *sdp; 778 struct swapent sep; 779 int count = 0; 780 int error; 781 782 KASSERT(len <= sizeof(sep)); 783 if (len == 0) 784 return ENOSYS; 785 786 if (misc < 0) 787 return EINVAL; 788 789 if (misc == 0 || uvmexp.nswapdev == 0) 790 return 0; 791 792 /* Make sure userland cannot exhaust kernel memory */ 793 if ((size_t)misc > (size_t)uvmexp.nswapdev) 794 misc = uvmexp.nswapdev; 795 796 KASSERT(rw_lock_held(&swap_syscall_lock)); 797 798 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 799 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 800 int inuse; 801 802 if (misc-- <= 0) 803 break; 804 805 inuse = btodb((uint64_t)sdp->swd_npginuse << 806 PAGE_SHIFT); 807 808 memset(&sep, 0, sizeof(sep)); 809 swapent_cvt(&sep, sdp, inuse); 810 if (f) 811 (*f)(&sep, &sep); 812 if ((error = copyout(&sep, ptr, len)) != 0) 813 return error; 814 ptr += len; 815 count++; 816 } 817 } 818 *retval = count; 819 return 0; 820 } 821 822 /* 823 * swap_on: attempt to enable a swapdev for swapping. note that the 824 * swapdev is already on the global list, but disabled (marked 825 * SWF_FAKE). 826 * 827 * => we avoid the start of the disk (to protect disk labels) 828 * => we also avoid the miniroot, if we are swapping to root. 829 * => caller should leave uvm_swap_data_lock unlocked, we may lock it 830 * if needed. 831 */ 832 static int 833 swap_on(struct lwp *l, struct swapdev *sdp) 834 { 835 struct vnode *vp; 836 int error, npages, nblocks, size; 837 long addr; 838 vmem_addr_t result; 839 struct vattr va; 840 dev_t dev; 841 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 842 843 /* 844 * we want to enable swapping on sdp. the swd_vp contains 845 * the vnode we want (locked and ref'd), and the swd_dev 846 * contains the dev_t of the file, if it a block device. 847 */ 848 849 vp = sdp->swd_vp; 850 dev = sdp->swd_dev; 851 852 /* 853 * open the swap file (mostly useful for block device files to 854 * let device driver know what is up). 855 * 856 * we skip the open/close for root on swap because the root 857 * has already been opened when root was mounted (mountroot). 858 */ 859 if (vp != rootvp) { 860 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred))) 861 return (error); 862 } 863 864 /* XXX this only works for block devices */ 865 UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0); 866 867 /* 868 * we now need to determine the size of the swap area. for 869 * block specials we can call the d_psize function. 870 * for normal files, we must stat [get attrs]. 
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if ((nblocks = bdev_size(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
		if (vp->v_tag == VT_NFS)
			sdp->swd_maxactive = 2; /* XXX */
		else
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * allocate space for swap encryption state and mark the
	 * keys uninitialized so we generate them lazily
	 */
	sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP);
	sdp->swd_encinit = false;

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
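		 * (the shift expressions below add 1/32 + 1/64 + 1/128 of
		 * rootblocks, i.e. roughly 5.5 percent.)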
980 */ 981 rootblocks += (rootblocks >> 5) + 982 (rootblocks >> 6) + 983 (rootblocks >> 7); 984 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT; 985 if (rootpages > size) 986 panic("swap_on: miniroot larger than swap?"); 987 988 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) { 989 panic("swap_on: unable to preserve miniroot"); 990 } 991 992 size -= rootpages; 993 printf("Preserved %d pages of miniroot ", rootpages); 994 printf("leaving %d pages of swap\n", size); 995 } 996 997 /* 998 * add a ref to vp to reflect usage as a swap device. 999 */ 1000 vref(vp); 1001 1002 /* 1003 * now add the new swapdev to the drum and enable. 1004 */ 1005 error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result); 1006 if (error != 0) 1007 panic("swapdrum_add"); 1008 /* 1009 * If this is the first regular swap create the workqueue. 1010 * => Protected by swap_syscall_lock. 1011 */ 1012 if (vp->v_type != VBLK) { 1013 if (sw_reg_count++ == 0) { 1014 KASSERT(sw_reg_workqueue == NULL); 1015 if (workqueue_create(&sw_reg_workqueue, "swapiod", 1016 sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0) 1017 panic("%s: workqueue_create failed", __func__); 1018 } 1019 } 1020 1021 sdp->swd_drumoffset = (int)result; 1022 sdp->swd_drumsize = npages; 1023 sdp->swd_npages = size; 1024 mutex_enter(&uvm_swap_data_lock); 1025 sdp->swd_flags &= ~SWF_FAKE; /* going live */ 1026 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); 1027 uvmexp.swpages += size; 1028 uvmexp.swpgavail += size; 1029 mutex_exit(&uvm_swap_data_lock); 1030 return (0); 1031 1032 /* 1033 * failure: clean up and return error. 1034 */ 1035 1036 bad: 1037 if (sdp->swd_blist) { 1038 blist_destroy(sdp->swd_blist); 1039 } 1040 if (vp != rootvp) { 1041 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred); 1042 } 1043 return (error); 1044 } 1045 1046 /* 1047 * swap_off: stop swapping on swapdev 1048 * 1049 * => swap data should be locked, we will unlock. 1050 */ 1051 static int 1052 swap_off(struct lwp *l, struct swapdev *sdp) 1053 { 1054 int npages = sdp->swd_npages; 1055 int error = 0; 1056 1057 UVMHIST_FUNC(__func__); 1058 UVMHIST_CALLARGS(pdhist, " dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0); 1059 1060 KASSERT(rw_write_held(&swap_syscall_lock)); 1061 KASSERT(mutex_owned(&uvm_swap_data_lock)); 1062 1063 /* disable the swap area being removed */ 1064 sdp->swd_flags &= ~SWF_ENABLE; 1065 uvmexp.swpgavail -= npages; 1066 mutex_exit(&uvm_swap_data_lock); 1067 1068 /* 1069 * the idea is to find all the pages that are paged out to this 1070 * device, and page them all in. in uvm, swap-backed pageable 1071 * memory can take two forms: aobjs and anons. call the 1072 * swapoff hook for each subsystem to bring in pages. 1073 */ 1074 1075 if (uao_swap_off(sdp->swd_drumoffset, 1076 sdp->swd_drumoffset + sdp->swd_drumsize) || 1077 amap_swap_off(sdp->swd_drumoffset, 1078 sdp->swd_drumoffset + sdp->swd_drumsize)) { 1079 error = ENOMEM; 1080 } else if (sdp->swd_npginuse > sdp->swd_npgbad) { 1081 error = EBUSY; 1082 } 1083 1084 if (error) { 1085 mutex_enter(&uvm_swap_data_lock); 1086 sdp->swd_flags |= SWF_ENABLE; 1087 uvmexp.swpgavail += npages; 1088 mutex_exit(&uvm_swap_data_lock); 1089 1090 return error; 1091 } 1092 1093 /* 1094 * If this is the last regular swap destroy the workqueue. 1095 * => Protected by swap_syscall_lock. 
1096 */ 1097 if (sdp->swd_vp->v_type != VBLK) { 1098 KASSERT(sw_reg_count > 0); 1099 KASSERT(sw_reg_workqueue != NULL); 1100 if (--sw_reg_count == 0) { 1101 workqueue_destroy(sw_reg_workqueue); 1102 sw_reg_workqueue = NULL; 1103 } 1104 } 1105 1106 /* 1107 * done with the vnode. 1108 * drop our ref on the vnode before calling VOP_CLOSE() 1109 * so that spec_close() can tell if this is the last close. 1110 */ 1111 vrele(sdp->swd_vp); 1112 if (sdp->swd_vp != rootvp) { 1113 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred); 1114 } 1115 1116 mutex_enter(&uvm_swap_data_lock); 1117 uvmexp.swpages -= npages; 1118 uvmexp.swpginuse -= sdp->swd_npgbad; 1119 1120 if (swaplist_find(sdp->swd_vp, true) == NULL) 1121 panic("%s: swapdev not in list", __func__); 1122 swaplist_trim(); 1123 mutex_exit(&uvm_swap_data_lock); 1124 1125 /* 1126 * free all resources! 1127 */ 1128 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize); 1129 blist_destroy(sdp->swd_blist); 1130 bufq_free(sdp->swd_tab); 1131 kmem_free(__UNVOLATILE(sdp->swd_encmap), 1132 encmap_size(sdp->swd_drumsize)); 1133 explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey); 1134 explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey); 1135 kmem_free(sdp, sizeof(*sdp)); 1136 return (0); 1137 } 1138 1139 void 1140 uvm_swap_shutdown(struct lwp *l) 1141 { 1142 struct swapdev *sdp; 1143 struct swappri *spp; 1144 struct vnode *vp; 1145 int error; 1146 1147 if (!uvm_swap_init_done) 1148 return; 1149 printf("turning off swap..."); 1150 rw_enter(&swap_syscall_lock, RW_WRITER); 1151 mutex_enter(&uvm_swap_data_lock); 1152 again: 1153 LIST_FOREACH(spp, &swap_priority, spi_swappri) 1154 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1155 if (sdp->swd_flags & SWF_FAKE) 1156 continue; 1157 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) 1158 continue; 1159 #ifdef DEBUG 1160 printf("\nturning off swap on %s...", sdp->swd_path); 1161 #endif 1162 /* Have to lock and reference vnode for swap_off(). */ 1163 vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY); 1164 vref(vp); 1165 error = swap_off(l, sdp); 1166 vput(vp); 1167 mutex_enter(&uvm_swap_data_lock); 1168 if (error) { 1169 printf("stopping swap on %s failed " 1170 "with error %d\n", sdp->swd_path, error); 1171 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1172 uvmexp.nswapdev--; 1173 swaplist_trim(); 1174 } 1175 goto again; 1176 } 1177 printf(" done\n"); 1178 mutex_exit(&uvm_swap_data_lock); 1179 rw_exit(&swap_syscall_lock); 1180 } 1181 1182 1183 /* 1184 * /dev/drum interface and i/o functions 1185 */ 1186 1187 /* 1188 * swstrategy: perform I/O on the drum 1189 * 1190 * => we must map the i/o request from the drum to the correct swapdev. 1191 */ 1192 static void 1193 swstrategy(struct buf *bp) 1194 { 1195 struct swapdev *sdp; 1196 struct vnode *vp; 1197 int pageno, bn; 1198 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1199 1200 /* 1201 * convert block number to swapdev. note that swapdev can't 1202 * be yanked out from under us because we are holding resources 1203 * in it (i.e. the blocks we are doing I/O on). 1204 */ 1205 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; 1206 mutex_enter(&uvm_swap_data_lock); 1207 sdp = swapdrum_getsdp(pageno); 1208 mutex_exit(&uvm_swap_data_lock); 1209 if (sdp == NULL) { 1210 bp->b_error = EINVAL; 1211 bp->b_resid = bp->b_bcount; 1212 biodone(bp); 1213 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0); 1214 return; 1215 } 1216 1217 /* 1218 * convert drum page number to block number on this swapdev. 
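	 * e.g. (illustrative numbers) with 4 KB pages and 512-byte disk
	 * blocks, drum page 1000 on a swapdev whose swd_drumoffset is 256
	 * maps to bn = (1000 - 256) * (4096 / 512) = 5952.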
1219 */ 1220 1221 pageno -= sdp->swd_drumoffset; /* page # on swapdev */ 1222 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ 1223 1224 UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd", 1225 ((bp->b_flags & B_READ) == 0) ? 1 : 0, 1226 sdp->swd_drumoffset, bn, bp->b_bcount); 1227 1228 /* 1229 * for block devices we finish up here. 1230 * for regular files we have to do more work which we delegate 1231 * to sw_reg_strategy(). 1232 */ 1233 1234 vp = sdp->swd_vp; /* swapdev vnode pointer */ 1235 switch (vp->v_type) { 1236 default: 1237 panic("%s: vnode type 0x%x", __func__, vp->v_type); 1238 1239 case VBLK: 1240 1241 /* 1242 * must convert "bp" from an I/O on /dev/drum to an I/O 1243 * on the swapdev (sdp). 1244 */ 1245 bp->b_blkno = bn; /* swapdev block number */ 1246 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */ 1247 1248 /* 1249 * if we are doing a write, we have to redirect the i/o on 1250 * drum's v_numoutput counter to the swapdevs. 1251 */ 1252 if ((bp->b_flags & B_READ) == 0) { 1253 mutex_enter(bp->b_objlock); 1254 vwakeup(bp); /* kills one 'v_numoutput' on drum */ 1255 mutex_exit(bp->b_objlock); 1256 mutex_enter(vp->v_interlock); 1257 vp->v_numoutput++; /* put it on swapdev */ 1258 mutex_exit(vp->v_interlock); 1259 } 1260 1261 /* 1262 * finally plug in swapdev vnode and start I/O 1263 */ 1264 bp->b_vp = vp; 1265 bp->b_objlock = vp->v_interlock; 1266 VOP_STRATEGY(vp, bp); 1267 return; 1268 1269 case VREG: 1270 /* 1271 * delegate to sw_reg_strategy function. 1272 */ 1273 sw_reg_strategy(sdp, bp, bn); 1274 return; 1275 } 1276 /* NOTREACHED */ 1277 } 1278 1279 /* 1280 * swread: the read function for the drum (just a call to physio) 1281 */ 1282 /*ARGSUSED*/ 1283 static int 1284 swread(dev_t dev, struct uio *uio, int ioflag) 1285 { 1286 UVMHIST_FUNC(__func__); 1287 UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); 1288 1289 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio)); 1290 } 1291 1292 /* 1293 * swwrite: the write function for the drum (just a call to physio) 1294 */ 1295 /*ARGSUSED*/ 1296 static int 1297 swwrite(dev_t dev, struct uio *uio, int ioflag) 1298 { 1299 UVMHIST_FUNC(__func__); 1300 UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); 1301 1302 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio)); 1303 } 1304 1305 const struct bdevsw swap_bdevsw = { 1306 .d_open = nullopen, 1307 .d_close = nullclose, 1308 .d_strategy = swstrategy, 1309 .d_ioctl = noioctl, 1310 .d_dump = nodump, 1311 .d_psize = nosize, 1312 .d_discard = nodiscard, 1313 .d_flag = D_OTHER 1314 }; 1315 1316 const struct cdevsw swap_cdevsw = { 1317 .d_open = nullopen, 1318 .d_close = nullclose, 1319 .d_read = swread, 1320 .d_write = swwrite, 1321 .d_ioctl = noioctl, 1322 .d_stop = nostop, 1323 .d_tty = notty, 1324 .d_poll = nopoll, 1325 .d_mmap = nommap, 1326 .d_kqfilter = nokqfilter, 1327 .d_discard = nodiscard, 1328 .d_flag = D_OTHER, 1329 }; 1330 1331 /* 1332 * sw_reg_strategy: handle swap i/o to regular files 1333 */ 1334 static void 1335 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) 1336 { 1337 struct vnode *vp; 1338 struct vndxfer *vnx; 1339 daddr_t nbn; 1340 char *addr; 1341 off_t byteoff; 1342 int s, off, nra, error, sz, resid; 1343 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1344 1345 /* 1346 * allocate a vndxfer head for this transfer and point it to 1347 * our buffer. 
	 */
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((uint64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("%s: swap to sparse file", __func__);
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
		    "vp %#jx/%#jx offset %#jx/%#jx",
		    (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
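		 * (sw_reg_iodone() below relies on this layout: it recovers
		 * the enclosing vndbuf from the queued buf with a cast.)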
1423 */ 1424 nbp = pool_get(&vndbuf_pool, PR_WAITOK); 1425 buf_init(&nbp->vb_buf); 1426 nbp->vb_buf.b_flags = bp->b_flags; 1427 nbp->vb_buf.b_cflags = bp->b_cflags; 1428 nbp->vb_buf.b_oflags = bp->b_oflags; 1429 nbp->vb_buf.b_bcount = sz; 1430 nbp->vb_buf.b_bufsize = sz; 1431 nbp->vb_buf.b_error = 0; 1432 nbp->vb_buf.b_data = addr; 1433 nbp->vb_buf.b_lblkno = 0; 1434 nbp->vb_buf.b_blkno = nbn + btodb(off); 1435 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno; 1436 nbp->vb_buf.b_iodone = sw_reg_biodone; 1437 nbp->vb_buf.b_vp = vp; 1438 nbp->vb_buf.b_objlock = vp->v_interlock; 1439 if (vp->v_type == VBLK) { 1440 nbp->vb_buf.b_dev = vp->v_rdev; 1441 } 1442 1443 nbp->vb_xfer = vnx; /* patch it back in to vnx */ 1444 1445 /* 1446 * Just sort by block number 1447 */ 1448 s = splbio(); 1449 if (vnx->vx_error != 0) { 1450 buf_destroy(&nbp->vb_buf); 1451 pool_put(&vndbuf_pool, nbp); 1452 goto out; 1453 } 1454 vnx->vx_pending++; 1455 1456 /* sort it in and start I/O if we are not over our limit */ 1457 /* XXXAD locking */ 1458 bufq_put(sdp->swd_tab, &nbp->vb_buf); 1459 sw_reg_start(sdp); 1460 splx(s); 1461 1462 /* 1463 * advance to the next I/O 1464 */ 1465 byteoff += sz; 1466 addr += sz; 1467 } 1468 1469 s = splbio(); 1470 1471 out: /* Arrive here at splbio */ 1472 vnx->vx_flags &= ~VX_BUSY; 1473 if (vnx->vx_pending == 0) { 1474 error = vnx->vx_error; 1475 pool_put(&vndxfer_pool, vnx); 1476 bp->b_error = error; 1477 biodone(bp); 1478 } 1479 splx(s); 1480 } 1481 1482 /* 1483 * sw_reg_start: start an I/O request on the requested swapdev 1484 * 1485 * => reqs are sorted by b_rawblkno (above) 1486 */ 1487 static void 1488 sw_reg_start(struct swapdev *sdp) 1489 { 1490 struct buf *bp; 1491 struct vnode *vp; 1492 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1493 1494 /* recursion control */ 1495 if ((sdp->swd_flags & SWF_BUSY) != 0) 1496 return; 1497 1498 sdp->swd_flags |= SWF_BUSY; 1499 1500 while (sdp->swd_active < sdp->swd_maxactive) { 1501 bp = bufq_get(sdp->swd_tab); 1502 if (bp == NULL) 1503 break; 1504 sdp->swd_active++; 1505 1506 UVMHIST_LOG(pdhist, 1507 "sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %#jx", 1508 (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno, 1509 bp->b_bcount); 1510 vp = bp->b_vp; 1511 KASSERT(bp->b_objlock == vp->v_interlock); 1512 if ((bp->b_flags & B_READ) == 0) { 1513 mutex_enter(vp->v_interlock); 1514 vp->v_numoutput++; 1515 mutex_exit(vp->v_interlock); 1516 } 1517 VOP_STRATEGY(vp, bp); 1518 } 1519 sdp->swd_flags &= ~SWF_BUSY; 1520 } 1521 1522 /* 1523 * sw_reg_biodone: one of our i/o's has completed 1524 */ 1525 static void 1526 sw_reg_biodone(struct buf *bp) 1527 { 1528 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL); 1529 } 1530 1531 /* 1532 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup 1533 * 1534 * => note that we can recover the vndbuf struct by casting the buf ptr 1535 */ 1536 static void 1537 sw_reg_iodone(struct work *wk, void *dummy) 1538 { 1539 struct vndbuf *vbp = (void *)wk; 1540 struct vndxfer *vnx = vbp->vb_xfer; 1541 struct buf *pbp = vnx->vx_bp; /* parent buffer */ 1542 struct swapdev *sdp = vnx->vx_sdp; 1543 int s, resid, error; 1544 KASSERT(&vbp->vb_buf.b_work == wk); 1545 UVMHIST_FUNC(__func__); 1546 UVMHIST_CALLARGS(pdhist, " vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx", 1547 (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, 1548 (uintptr_t)vbp->vb_buf.b_data); 1549 UVMHIST_LOG(pdhist, " cnt=%#jx resid=%#jx", 1550 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0); 1551 1552 /* 1553 * protect vbp at 
splbio and update. 1554 */ 1555 1556 s = splbio(); 1557 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; 1558 pbp->b_resid -= resid; 1559 vnx->vx_pending--; 1560 1561 if (vbp->vb_buf.b_error != 0) { 1562 /* pass error upward */ 1563 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO; 1564 UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0); 1565 vnx->vx_error = error; 1566 } 1567 1568 /* 1569 * kill vbp structure 1570 */ 1571 buf_destroy(&vbp->vb_buf); 1572 pool_put(&vndbuf_pool, vbp); 1573 1574 /* 1575 * wrap up this transaction if it has run to completion or, in 1576 * case of an error, when all auxiliary buffers have returned. 1577 */ 1578 if (vnx->vx_error != 0) { 1579 /* pass error upward */ 1580 error = vnx->vx_error; 1581 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { 1582 pbp->b_error = error; 1583 biodone(pbp); 1584 pool_put(&vndxfer_pool, vnx); 1585 } 1586 } else if (pbp->b_resid == 0) { 1587 KASSERT(vnx->vx_pending == 0); 1588 if ((vnx->vx_flags & VX_BUSY) == 0) { 1589 UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !", 1590 (uintptr_t)pbp, vnx->vx_error, 0, 0); 1591 biodone(pbp); 1592 pool_put(&vndxfer_pool, vnx); 1593 } 1594 } 1595 1596 /* 1597 * done! start next swapdev I/O if one is pending 1598 */ 1599 sdp->swd_active--; 1600 sw_reg_start(sdp); 1601 splx(s); 1602 } 1603 1604 1605 /* 1606 * uvm_swap_alloc: allocate space on swap 1607 * 1608 * => allocation is done "round robin" down the priority list, as we 1609 * allocate in a priority we "rotate" the circle queue. 1610 * => space can be freed with uvm_swap_free 1611 * => we return the page slot number in /dev/drum (0 == invalid slot) 1612 * => we lock uvm_swap_data_lock 1613 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM 1614 */ 1615 int 1616 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok) 1617 { 1618 struct swapdev *sdp; 1619 struct swappri *spp; 1620 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1621 1622 /* 1623 * no swap devices configured yet? definite failure. 1624 */ 1625 if (uvmexp.nswapdev < 1) 1626 return 0; 1627 1628 /* 1629 * XXXJAK: BEGIN HACK 1630 * 1631 * blist_alloc() in subr_blist.c will panic if we try to allocate 1632 * too many slots. 1633 */ 1634 if (*nslots > BLIST_MAX_ALLOC) { 1635 if (__predict_false(lessok == false)) 1636 return 0; 1637 *nslots = BLIST_MAX_ALLOC; 1638 } 1639 /* XXXJAK: END HACK */ 1640 1641 /* 1642 * lock data lock, convert slots into blocks, and enter loop 1643 */ 1644 mutex_enter(&uvm_swap_data_lock); 1645 1646 ReTry: /* XXXMRG */ 1647 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 1648 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1649 uint64_t result; 1650 1651 /* if it's not enabled, then we can't swap from it */ 1652 if ((sdp->swd_flags & SWF_ENABLE) == 0) 1653 continue; 1654 if (sdp->swd_npginuse + *nslots > sdp->swd_npages) 1655 continue; 1656 result = blist_alloc(sdp->swd_blist, *nslots); 1657 if (result == BLIST_NONE) { 1658 continue; 1659 } 1660 KASSERT(result < sdp->swd_drumsize); 1661 1662 /* 1663 * successful allocation! now rotate the tailq. 1664 */ 1665 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1666 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 1667 sdp->swd_npginuse += *nslots; 1668 uvmexp.swpginuse += *nslots; 1669 mutex_exit(&uvm_swap_data_lock); 1670 /* done! return drum slot number */ 1671 UVMHIST_LOG(pdhist, 1672 "success! 
returning %jd slots starting at %jd", 1673 *nslots, result + sdp->swd_drumoffset, 0, 0); 1674 return (result + sdp->swd_drumoffset); 1675 } 1676 } 1677 1678 /* XXXMRG: BEGIN HACK */ 1679 if (*nslots > 1 && lessok) { 1680 *nslots = 1; 1681 /* XXXMRG: ugh! blist should support this for us */ 1682 goto ReTry; 1683 } 1684 /* XXXMRG: END HACK */ 1685 1686 mutex_exit(&uvm_swap_data_lock); 1687 return 0; 1688 } 1689 1690 /* 1691 * uvm_swapisfull: return true if most of available swap is allocated 1692 * and in use. we don't count some small portion as it may be inaccessible 1693 * to us at any given moment, for example if there is lock contention or if 1694 * pages are busy. 1695 */ 1696 bool 1697 uvm_swapisfull(void) 1698 { 1699 int swpgonly; 1700 bool rv; 1701 1702 if (uvmexp.swpages == 0) { 1703 return true; 1704 } 1705 1706 mutex_enter(&uvm_swap_data_lock); 1707 KASSERT(uvmexp.swpgonly <= uvmexp.swpages); 1708 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 / 1709 uvm_swapisfull_factor); 1710 rv = (swpgonly >= uvmexp.swpgavail); 1711 mutex_exit(&uvm_swap_data_lock); 1712 1713 return (rv); 1714 } 1715 1716 /* 1717 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors 1718 * 1719 * => we lock uvm_swap_data_lock 1720 */ 1721 void 1722 uvm_swap_markbad(int startslot, int nslots) 1723 { 1724 struct swapdev *sdp; 1725 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1726 1727 mutex_enter(&uvm_swap_data_lock); 1728 sdp = swapdrum_getsdp(startslot); 1729 KASSERT(sdp != NULL); 1730 1731 /* 1732 * we just keep track of how many pages have been marked bad 1733 * in this device, to make everything add up in swap_off(). 1734 * we assume here that the range of slots will all be within 1735 * one swap device. 1736 */ 1737 1738 KASSERT(uvmexp.swpgonly >= nslots); 1739 atomic_add_int(&uvmexp.swpgonly, -nslots); 1740 sdp->swd_npgbad += nslots; 1741 UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0); 1742 mutex_exit(&uvm_swap_data_lock); 1743 } 1744 1745 /* 1746 * uvm_swap_free: free swap slots 1747 * 1748 * => this can be all or part of an allocation made by uvm_swap_alloc 1749 * => we lock uvm_swap_data_lock 1750 */ 1751 void 1752 uvm_swap_free(int startslot, int nslots) 1753 { 1754 struct swapdev *sdp; 1755 UVMHIST_FUNC(__func__); 1756 UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots, 1757 startslot, 0, 0); 1758 1759 /* 1760 * ignore attempts to free the "bad" slot. 1761 */ 1762 1763 if (startslot == SWSLOT_BAD) { 1764 return; 1765 } 1766 1767 /* 1768 * convert drum slot offset back to sdp, free the blocks 1769 * in the extent, and return. must hold pri lock to do 1770 * lookup and access the extent. 1771 */ 1772 1773 mutex_enter(&uvm_swap_data_lock); 1774 sdp = swapdrum_getsdp(startslot); 1775 KASSERT(uvmexp.nswapdev >= 1); 1776 KASSERT(sdp != NULL); 1777 KASSERT(sdp->swd_npginuse >= nslots); 1778 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); 1779 sdp->swd_npginuse -= nslots; 1780 uvmexp.swpginuse -= nslots; 1781 mutex_exit(&uvm_swap_data_lock); 1782 } 1783 1784 /* 1785 * uvm_swap_put: put any number of pages into a contig place on swap 1786 * 1787 * => can be sync or async 1788 */ 1789 1790 int 1791 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) 1792 { 1793 int error; 1794 1795 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE | 1796 ((flags & PGO_SYNCIO) ? 
0 : B_ASYNC)); 1797 return error; 1798 } 1799 1800 /* 1801 * uvm_swap_get: get a single page from swap 1802 * 1803 * => usually a sync op (from fault) 1804 */ 1805 1806 int 1807 uvm_swap_get(struct vm_page *page, int swslot, int flags) 1808 { 1809 int error; 1810 1811 atomic_inc_uint(&uvmexp.nswget); 1812 KASSERT(flags & PGO_SYNCIO); 1813 if (swslot == SWSLOT_BAD) { 1814 return EIO; 1815 } 1816 1817 error = uvm_swap_io(&page, swslot, 1, B_READ | 1818 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1819 if (error == 0) { 1820 1821 /* 1822 * this page is no longer only in swap. 1823 */ 1824 1825 KASSERT(uvmexp.swpgonly > 0); 1826 atomic_dec_uint(&uvmexp.swpgonly); 1827 } 1828 return error; 1829 } 1830 1831 /* 1832 * uvm_swap_io: do an i/o operation to swap 1833 */ 1834 1835 static int 1836 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) 1837 { 1838 daddr_t startblk; 1839 struct buf *bp; 1840 vaddr_t kva; 1841 int error, mapinflags; 1842 bool write, async, swap_encrypt; 1843 UVMHIST_FUNC(__func__); 1844 UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx", 1845 startslot, npages, flags, 0); 1846 1847 write = (flags & B_READ) == 0; 1848 async = (flags & B_ASYNC) != 0; 1849 swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt); 1850 1851 /* 1852 * allocate a buf for the i/o. 1853 */ 1854 1855 KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async)); 1856 bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp); 1857 if (bp == NULL) { 1858 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM); 1859 return ENOMEM; 1860 } 1861 1862 /* 1863 * convert starting drum slot to block number 1864 */ 1865 1866 startblk = btodb((uint64_t)startslot << PAGE_SHIFT); 1867 1868 /* 1869 * first, map the pages into the kernel. 1870 */ 1871 1872 mapinflags = !write ? 1873 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ : 1874 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE; 1875 if (write && swap_encrypt) /* need to encrypt in-place */ 1876 mapinflags |= UVMPAGER_MAPIN_READ; 1877 kva = uvm_pagermapin(pps, npages, mapinflags); 1878 1879 /* 1880 * encrypt writes in place if requested 1881 */ 1882 1883 if (write) do { 1884 struct swapdev *sdp; 1885 int i; 1886 1887 /* 1888 * Get the swapdev so we can discriminate on the 1889 * encryption state. There may or may not be an 1890 * encryption key generated; we may or may not be asked 1891 * to encrypt swap. 1892 * 1893 * 1. NO KEY, NO ENCRYPTION: Nothing to do. 1894 * 1895 * 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt, 1896 * and mark the slots encrypted. 1897 * 1898 * 3. KEY, BUT NO ENCRYPTION: The slots may already be 1899 * marked encrypted from a past life. Mark them not 1900 * encrypted. 1901 * 1902 * 4. KEY, ENCRYPTION: Encrypt and mark the slots 1903 * encrypted. 
1904 */ 1905 mutex_enter(&uvm_swap_data_lock); 1906 sdp = swapdrum_getsdp(startslot); 1907 if (!sdp->swd_encinit) { 1908 if (!swap_encrypt) { 1909 mutex_exit(&uvm_swap_data_lock); 1910 break; 1911 } 1912 uvm_swap_genkey(sdp); 1913 } 1914 KASSERT(sdp->swd_encinit); 1915 mutex_exit(&uvm_swap_data_lock); 1916 1917 for (i = 0; i < npages; i++) { 1918 int s = startslot + i; 1919 KDASSERT(swapdrum_sdp_is(s, sdp)); 1920 KASSERT(s >= sdp->swd_drumoffset); 1921 s -= sdp->swd_drumoffset; 1922 KASSERT(s < sdp->swd_drumsize); 1923 1924 if (swap_encrypt) { 1925 uvm_swap_encryptpage(sdp, 1926 (void *)(kva + (vsize_t)i*PAGE_SIZE), s); 1927 atomic_or_32(&sdp->swd_encmap[s/32], 1928 __BIT(s%32)); 1929 } else { 1930 atomic_and_32(&sdp->swd_encmap[s/32], 1931 ~__BIT(s%32)); 1932 } 1933 } 1934 } while (0); 1935 1936 /* 1937 * fill in the bp/sbp. we currently route our i/o through 1938 * /dev/drum's vnode [swapdev_vp]. 1939 */ 1940 1941 bp->b_cflags = BC_BUSY | BC_NOCACHE; 1942 bp->b_flags = (flags & (B_READ|B_ASYNC)); 1943 bp->b_proc = &proc0; /* XXX */ 1944 bp->b_vnbufs.le_next = NOLIST; 1945 bp->b_data = (void *)kva; 1946 bp->b_blkno = startblk; 1947 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT; 1948 1949 /* 1950 * bump v_numoutput (counter of number of active outputs). 1951 */ 1952 1953 if (write) { 1954 mutex_enter(swapdev_vp->v_interlock); 1955 swapdev_vp->v_numoutput++; 1956 mutex_exit(swapdev_vp->v_interlock); 1957 } 1958 1959 /* 1960 * for async ops we must set up the iodone handler. 1961 */ 1962 1963 if (async) { 1964 bp->b_iodone = uvm_aio_aiodone; 1965 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0); 1966 if (curlwp == uvm.pagedaemon_lwp) 1967 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1968 else 1969 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 1970 } else { 1971 bp->b_iodone = NULL; 1972 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1973 } 1974 UVMHIST_LOG(pdhist, 1975 "about to start io: data = %#jx blkno = %#jx, bcount = %jd", 1976 (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0); 1977 1978 /* 1979 * now we start the I/O, and if async, return. 1980 */ 1981 1982 VOP_STRATEGY(swapdev_vp, bp); 1983 if (async) { 1984 /* 1985 * Reads are always synchronous; if this changes, we 1986 * need to add an asynchronous path for decryption. 1987 */ 1988 KASSERT(write); 1989 return 0; 1990 } 1991 1992 /* 1993 * must be sync i/o. wait for it to finish 1994 */ 1995 1996 error = biowait(bp); 1997 if (error) 1998 goto out; 1999 2000 /* 2001 * decrypt reads in place if needed 2002 */ 2003 2004 if (!write) do { 2005 struct swapdev *sdp; 2006 bool encinit; 2007 int i; 2008 2009 /* 2010 * Get the sdp. Everything about it except the encinit 2011 * bit, saying whether the encryption key is 2012 * initialized or not, and the encrypted bit for each 2013 * page, is stable until all swap pages have been 2014 * released and the device is removed. 2015 */ 2016 mutex_enter(&uvm_swap_data_lock); 2017 sdp = swapdrum_getsdp(startslot); 2018 encinit = sdp->swd_encinit; 2019 mutex_exit(&uvm_swap_data_lock); 2020 2021 if (!encinit) 2022 /* 2023 * If there's no encryption key, there's no way 2024 * any of these slots can be encrypted, so 2025 * nothing to do here. 
2026 */ 2027 break; 2028 for (i = 0; i < npages; i++) { 2029 int s = startslot + i; 2030 KDASSERT(swapdrum_sdp_is(s, sdp)); 2031 KASSERT(s >= sdp->swd_drumoffset); 2032 s -= sdp->swd_drumoffset; 2033 KASSERT(s < sdp->swd_drumsize); 2034 if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) & 2035 __BIT(s%32)) == 0) 2036 continue; 2037 uvm_swap_decryptpage(sdp, 2038 (void *)(kva + (vsize_t)i*PAGE_SIZE), s); 2039 } 2040 } while (0); 2041 out: 2042 /* 2043 * kill the pager mapping 2044 */ 2045 2046 uvm_pagermapout(kva, npages); 2047 2048 /* 2049 * now dispose of the buf and we're done. 2050 */ 2051 2052 if (write) { 2053 mutex_enter(swapdev_vp->v_interlock); 2054 vwakeup(bp); 2055 mutex_exit(swapdev_vp->v_interlock); 2056 } 2057 putiobuf(bp); 2058 UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0); 2059 2060 return (error); 2061 } 2062 2063 /* 2064 * uvm_swap_genkey(sdp) 2065 * 2066 * Generate a key for swap encryption. 2067 */ 2068 static void 2069 uvm_swap_genkey(struct swapdev *sdp) 2070 { 2071 uint8_t key[32]; 2072 2073 KASSERT(!sdp->swd_encinit); 2074 2075 cprng_strong(kern_cprng, key, sizeof key, 0); 2076 aes_setenckey256(&sdp->swd_enckey, key); 2077 aes_setdeckey256(&sdp->swd_deckey, key); 2078 explicit_memset(key, 0, sizeof key); 2079 2080 sdp->swd_encinit = true; 2081 } 2082 2083 /* 2084 * uvm_swap_encryptpage(sdp, kva, slot) 2085 * 2086 * Encrypt one page of data at kva for the specified slot number 2087 * in the swap device. 2088 */ 2089 static void 2090 uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot) 2091 { 2092 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16); 2093 2094 /* iv := AES_k(le32enc(slot) || 0^96) */ 2095 le32enc(preiv, slot); 2096 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS); 2097 2098 /* *kva := AES-CBC_k(iv, *kva) */ 2099 aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv, 2100 AES_256_NROUNDS); 2101 2102 explicit_memset(&iv, 0, sizeof iv); 2103 } 2104 2105 /* 2106 * uvm_swap_decryptpage(sdp, kva, slot) 2107 * 2108 * Decrypt one page of data at kva for the specified slot number 2109 * in the swap device. 2110 */ 2111 static void 2112 uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot) 2113 { 2114 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16); 2115 2116 /* iv := AES_k(le32enc(slot) || 0^96) */ 2117 le32enc(preiv, slot); 2118 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS); 2119 2120 /* *kva := AES-CBC^{-1}_k(iv, *kva) */ 2121 aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv, 2122 AES_256_NROUNDS); 2123 2124 explicit_memset(&iv, 0, sizeof iv); 2125 } 2126 2127 SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup") 2128 { 2129 2130 sysctl_createv(clog, 0, NULL, NULL, 2131 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt", 2132 SYSCTL_DESCR("Encrypt data when swapped out to disk"), 2133 NULL, 0, &uvm_swap_encrypt, 0, 2134 CTL_VM, CTL_CREATE, CTL_EOL); 2135 } 2136
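
/*
 * Illustrative userland sketch (not part of the kernel build): how the
 * interfaces above are typically exercised.  The command and program below
 * are examples only.
 *
 * Enable encryption of newly swapped-out pages via the sysctl created by
 * sysctl_uvmswap_setup() above:
 *
 *	sysctl -w vm.swap_encrypt=1
 *
 * Query the number of configured swap devices with swapctl(2), i.e. the
 * SWAP_NSWAP command handled by sys_swapctl() above:
 *
 *	#include <sys/swap.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int n = swapctl(SWAP_NSWAP, NULL, 0);
 *
 *		printf("%d swap device(s) configured\n", n);
 *		return 0;
 *	}
 */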