1 /* $NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $ */ 2 3 /* 4 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
27 * 28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp 29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $"); 34 35 #include "opt_uvmhist.h" 36 #include "opt_compat_netbsd.h" 37 #include "opt_ddb.h" 38 #include "opt_vmswap.h" 39 40 #include <sys/param.h> 41 #include <sys/systm.h> 42 #include <sys/atomic.h> 43 #include <sys/buf.h> 44 #include <sys/bufq.h> 45 #include <sys/conf.h> 46 #include <sys/cprng.h> 47 #include <sys/proc.h> 48 #include <sys/namei.h> 49 #include <sys/disklabel.h> 50 #include <sys/errno.h> 51 #include <sys/kernel.h> 52 #include <sys/vnode.h> 53 #include <sys/file.h> 54 #include <sys/vmem.h> 55 #include <sys/blist.h> 56 #include <sys/mount.h> 57 #include <sys/pool.h> 58 #include <sys/kmem.h> 59 #include <sys/syscallargs.h> 60 #include <sys/swap.h> 61 #include <sys/kauth.h> 62 #include <sys/sysctl.h> 63 #include <sys/workqueue.h> 64 65 #include <uvm/uvm.h> 66 67 #include <miscfs/specfs/specdev.h> 68 69 #include <crypto/aes/aes.h> 70 #include <crypto/aes/aes_cbc.h> 71 72 /* 73 * uvm_swap.c: manage configuration and i/o to swap space. 74 */ 75 76 /* 77 * swap space is managed in the following way: 78 * 79 * each swap partition or file is described by a "swapdev" structure. 80 * each "swapdev" structure contains a "swapent" structure which contains 81 * information that is passed up to the user (via system calls). 82 * 83 * each swap partition is assigned a "priority" (int) which controls 84 * swap partition usage. 85 * 86 * the system maintains a global data structure describing all swap 87 * partitions/files. there is a sorted LIST of "swappri" structures 88 * which describe "swapdev"'s at that priority. this LIST is headed 89 * by the "swap_priority" global var. each "swappri" contains a 90 * TAILQ of "swapdev" structures at that priority. 
91 * 92 * locking: 93 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl 94 * system call and prevents the swap priority list from changing 95 * while we are in the middle of a system call (e.g. SWAP_STATS). 96 * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data 97 * structures including the priority list, the swapdev structures, 98 * and the swapmap arena. 99 * 100 * each swap device has the following info: 101 * - swap device in use (could be disabled, preventing future use) 102 * - swap enabled (allows new allocations on swap) 103 * - map info in /dev/drum 104 * - vnode pointer 105 * for swap files only: 106 * - block size 107 * - max byte count in buffer 108 * - buffer 109 * 110 * userland controls and configures swap with the swapctl(2) system call. 111 * the sys_swapctl performs the following operations: 112 * [1] SWAP_NSWAP: returns the number of swap devices currently configured 113 * [2] SWAP_STATS: given a pointer to an array of swapent structures 114 * (passed in via "arg") of a size passed in via "misc" ... we load 115 * the current swap config into the array. The actual work is done 116 * in the uvm_swap_stats() function. 117 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a 118 * priority in "misc", start swapping on it. 
*  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 *
 * NOTE(review): the structure below has no swd_inuse or swd_mapsize
 * members; the closest fields are swd_npginuse and swd_drumsize (both
 * counted in pages, not blocks).  confirm what invariant is intended.
 */
struct swapdev {
	dev_t			swd_dev;	/* device id */
	int			swd_flags;	/* flags:inuse/enable/fake */
	int			swd_priority;	/* our priority */
	int			swd_nblks;	/* blocks in this device */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */

	volatile uint32_t	*swd_encmap;	/* bitmap of encrypted slots */
	struct aesenc		swd_enckey;	/* AES key expanded for enc */
	struct aesdec		swd_deckey;	/* AES key expanded for dec */
	bool			swd_encinit;	/* true if keys initialized */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;     /* priority */
	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * We keep a pool of vndbuf and vndxfer structures.
 */
static struct pool vndxfer_pool, vndbuf_pool;

/*
 * local variables
 */
static vmem_t *swapmap;	/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static kmutex_t uvm_swap_data_lock __cacheline_aligned;
static krwlock_t swap_syscall_lock;
bool uvm_swap_init_done = false;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/* tuneables */
u_int uvm_swapisfull_factor = 99;
/* swap encryption defaults to on unless VMSWAP_DEFAULT_PLAINTEXT is set */
#if VMSWAP_DEFAULT_PLAINTEXT
bool uvm_swap_encrypt = false;
#else
bool uvm_swap_encrypt = true;
#endif

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

static void uvm_swap_genkey(struct swapdev *);
static void uvm_swap_encryptpage(struct swapdev *, void *, int);
static void uvm_swap_decryptpage(struct swapdev *, void *, int);

/*
 * encmap_size: number of bytes needed for an swd_encmap bitmap that
 *	covers `npages' pages
 *
 * => one bit per page, rounded up to a whole number of swd_encmap words
 * => `sdp' only appears inside sizeof, so it is never dereferenced
 */
static size_t
encmap_size(size_t npages)
{
	struct swapdev *sdp;
	const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
	const size_t bitsperword = NBBY * bytesperword;
	const size_t nbits = npages; /* one bit for each page */
	const size_t nwords = howmany(nbits, bitsperword);
	const size_t nbytes = nwords * bytesperword;

	return nbytes;
}

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 * => panics if the swap device vnode cannot be obtained, locked or
 *	opened, or if the swapmap arena cannot be created
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC(__func__);

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("%s: can't get vnode for swap device", __func__);
	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
		panic("%s: can't lock swap device", __func__);
	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
		panic("%s: can't open swap device", __func__);
	VOP_UNLOCK(swapdev_vp);

	/*
	 * create swap block resource map to map /dev/drum.   the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
	    VM_NOSLEEP, IPL_NONE);
	if (swapmap == 0) {
		panic("%s: vmem_create failed", __func__);
	}

	/* pools backing the vndxfer/vndbuf structures declared above */
	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
	    NULL, IPL_BIO);
	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
	    NULL, IPL_BIO);

	/* uvm_swap_shutdown() checks this flag before doing any work */
	uvm_swap_init_done = true;

	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly allocated swappri structure (we will
 *	FREE it if we don't need it... this is to prevent allocation
 *	blocking here while adding swap)
 * => ownership of "newspp" always passes to this function: it is either
 *	linked into swap_priority or released with kmem_free()
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	KASSERT(rw_write_held(&swap_syscall_lock));
	KASSERT(mutex_owned(&uvm_swap_data_lock));

	/*
	 * find entry at or after which to insert the new device.
	 * the scan stops at the first entry whose priority value is
	 * >= ours; pspp trails one entry behind.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;  /* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %jd",
			    priority, 0, 0, 0);

		spp->spi_priority = priority;
		TAILQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		kmem_free(newspp, sizeof(*newspp));
	}

	/*
	 * priority found (or created).   now insert on the priority's
	 * tailq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 *	(swap_syscall_lock must be write-held when "remove" is true)
 * => we return the swapdev we found (and removed), or NULL if no
 *	swapdev is backed by "vp"
 * => removal only unlinks the swapdev and decrements uvmexp.nswapdev;
 *	nothing is freed here and empty swappri entries are left in place
 */
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	KASSERT(rw_lock_held(&swap_syscall_lock));
	KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1);
	KASSERT(mutex_owned(&uvm_swap_data_lock));

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					TAILQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
		}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
*
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 */
static void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	KASSERT(rw_write_held(&swap_syscall_lock));
	KASSERT(mutex_owned(&uvm_swap_data_lock));

	/* _SAFE variant: the current entry may be unlinked and freed */
	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
		if (!TAILQ_EMPTY(&spp->spi_swapdev))
			continue;
		LIST_REMOVE(spp, spi_swappri);
		kmem_free(spp, sizeof(*spp));
	}
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm_swap_data_lock
 * => entries still marked SWF_FAKE are skipped
 * => returns NULL if no swapdev covers "pgno"
 */
static struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	KASSERT(mutex_owned(&uvm_swap_data_lock));

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			/* half-open range [drumoffset, drumoffset+drumsize) */
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}

/*
 * swapdrum_sdp_is: true iff the swap device for pgno is sdp
 *
 * => for use in positive assertions only; result is not stable
 * => takes and releases uvm_swap_data_lock itself, so the caller must
 *	not already hold it
 */
static bool __debugused
swapdrum_sdp_is(int pgno, struct swapdev *sdp)
{
	bool result;

	mutex_enter(&uvm_swap_data_lock);
	result = swapdrum_getsdp(pgno) == sdp;
	mutex_exit(&uvm_swap_data_lock);

	return result;
}

/*
 * swapsys_lock: acquire swap_syscall_lock in the mode given by "op"
 */
void swapsys_lock(krw_t op)
{
	rw_enter(&swap_syscall_lock, op);
}

/*
 * swapsys_unlock: release swap_syscall_lock
 */
void swapsys_unlock(void)
{
	rw_exit(&swap_syscall_lock);
}

/*
 * swapent_cvt: fill in the swapent "se" from the swapdev "sdp"
 *
 * => "inuse" is supplied by the caller and stored in se_inuse as-is
 * => the pathname is copied with strcpy(); the only length check is the
 *	KASSERT below
 */
static void
swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
{
	se->se_dev = sdp->swd_dev;
	se->se_flags = sdp->swd_flags;
	se->se_nblks = sdp->swd_nblks;
	se->se_inuse = inuse;
	se->se_priority = sdp->swd_priority;
	KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
	strcpy(se->se_path, sdp->swd_path);
}

/*
 * hooks called by sys_swapctl() for the SWAP_STATS13 and SWAP_STATS50
 * commands.  both start out pointing at enosys.
 */
int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
    (void *)enosys;
int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
    (void *)enosys;

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 *
 * => SWAP_NSWAP is answered without taking any lock
 * => every other command runs with swap_syscall_lock write-held
 * => the STATS and GETDUMPDEV commands are handled before the
 *	kauth_authorize_system() check; everything after it needs
 *	KAUTH_SYSTEM_SWAPCTL
 * => returns 0 or an errno value
 */
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
#define SWAP_PATH_MAX (PATH_MAX + 1)
	char	*userpath;
	size_t	len = 0;
	int	error;
	int	priority;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	/*
	 * we handle the non-priv NSWAP and STATS request first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		const int nswapdev = uvmexp.nswapdev;
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
		    0, 0, 0);
		*retval = nswapdev;
		return 0;
	}

	/* scratch buffer for the pathname; freed at "out" on every path */
	userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter(&swap_syscall_lock, RW_WRITER);

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm_swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	switch (SCARG(uap, cmd)) {
	case SWAP_STATS13:
		error = (*uvm_swap_stats13)(uap, retval);
		goto out;
	case SWAP_STATS50:
		error = (*uvm_swap_stats50)(uap, retval);
		goto out;
	case SWAP_STATS:
		error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
		    NULL, sizeof(struct swapent), retval);
		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
		goto out;

	case SWAP_GETDUMPDEV:
		error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
		goto out;
	default:
		break;
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
	    0, NULL, NULL, NULL)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
		/* drop the current dump device */
		dumpdev = NODEV;
		dumpcdev = NODEV;
		cpu_dumpconf();
		goto out;
	}

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		vref(vp);
		if (vn_lock(vp, LK_EXCLUSIVE)) {
			vrele(vp);
			error = EBUSY;
			goto out;
		}
		/* only SWAP_ON needs the name; "len" stays 0 otherwise */
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		struct pathbuf *pb;

		/*
		 * This used to allow copying in one extra byte
		 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
		 * This was completely pointless because if anyone
		 * used that extra byte namei would fail with
		 * ENAMETOOLONG anyway, so I've removed the excess
		 * logic. - dholland 20100215
		 */

		error = pathbuf_copyin(SCARG(uap, arg), &pb);
		if (error) {
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON) {
			/* get a copy of the string */
			pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
			len = strlen(userpath) + 1;	/* includes the NUL */
		}
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
		if ((error = namei(&nd))) {
			pathbuf_destroy(pb);
			goto out;
		}
		vp = nd.ni_vp;
		pathbuf_destroy(pb);
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		/*
		 * NOTE(review): the else branch resets dumpdev only and
		 * leaves dumpcdev untouched, unlike SWAP_DUMPOFF above
		 * which resets both -- confirm this is intended.
		 */
		if (bdevsw_lookup(vp->v_rdev)) {
			dumpdev = vp->v_rdev;
			dumpcdev = devsw_blk2chr(dumpdev);
		} else
			dumpdev = NODEV;
		cpu_dumpconf();
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		/* allocate before taking the mutex; KM_SLEEP may block */
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, true)) == NULL) {
			error = ENOENT;
		} else {
			/* swaplist_insert() consumes or frees spp */
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		mutex_exit(&uvm_swap_data_lock);
		if (error)
			kmem_free(spp, sizeof(*spp));
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		sdp->swd_flags = SWF_FAKE;
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
		mutex_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, false) != NULL) {
			error = EBUSY;
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			kmem_free(sdp, sizeof(*sdp));
			kmem_free(spp, sizeof(*spp));
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mutex_exit(&uvm_swap_data_lock);

		/* keep a private copy of the pathname on the swapdev */
		KASSERT(len > 0);
		sdp->swd_pathlen = len;
		sdp->swd_path = kmem_alloc(len, KM_SLEEP);
		if (copystr(userpath, sdp->swd_path, len, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(l, sdp)) != 0) {
			mutex_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, true);  /* kill fake entry */
			swaplist_trim();
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			kmem_free(sdp->swd_path, sdp->swd_pathlen);
			kmem_free(sdp, sizeof(*sdp));
			break;
		}
		break;

	case SWAP_OFF:
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, false)) == NULL) {
			mutex_exit(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mutex_exit(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.  swap_off() is entered with
		 * uvm_swap_data_lock held and releases it itself.
		 */
		error = swap_off(l, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);
out:
	rw_exit(&swap_syscall_lock);
	kmem_free(userpath, SWAP_PATH_MAX);

	UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0);
	return (error);
}

/*
 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 * emulation to use it directly without going through sys_swapctl().
 * The problem with using sys_swapctl() there is that it involves
 * copying the swapent array to the stackgap, and this array's size
 * is not known at build time. Hence it would not be possible to
 * ensure it would fit in the stackgap in any case.
 *
 * => "ptr" is the user buffer, "misc" the number of entries it can hold
 * => "len" is the number of bytes copied out per entry; it must not
 *	exceed sizeof(struct swapent), and 0 yields ENOSYS
 * => "f", if non-NULL, is applied to each entry before the copyout
 * => caller must hold swap_syscall_lock (asserted once the early
 *	returns are past)
 * => on success *retval is the number of entries copied out; note that
 *	the early "return 0" paths leave *retval untouched
 */
int
uvm_swap_stats(char *ptr, int misc,
    void (*f)(void *, const struct swapent *), size_t len,
    register_t *retval)
{
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent sep;
	int count = 0;
	int error;

	KASSERT(len <= sizeof(sep));
	if (len == 0)
		return ENOSYS;

	if (misc < 0)
		return EINVAL;

	if (misc == 0 || uvmexp.nswapdev == 0)
		return 0;

	/* Make sure userland cannot exhaust kernel memory */
	if ((size_t)misc > (size_t)uvmexp.nswapdev)
		misc = uvmexp.nswapdev;

	KASSERT(rw_lock_held(&swap_syscall_lock));

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			int inuse;

			/*
			 * this only leaves the inner loop, but misc is
			 * then negative so later tailqs break at once too.
			 */
			if (misc-- <= 0)
				break;

			/* pages in use -> disk blocks in use */
			inuse = btodb((uint64_t)sdp->swd_npginuse <<
			    PAGE_SHIFT);

			/* zero first so padding is not copied out stale */
			memset(&sep, 0, sizeof(sep));
			swapent_cvt(&sep, sdp, inuse);
			if (f)
				(*f)(&sep, &sep);
			if ((error = copyout(&sep, ptr, len)) != 0)
				return error;
			ptr += len;
			count++;
		}
	}
	*retval = count;
	return 0;
}

/*
 * swap_on: attempt to enable a swapdev for swapping.   note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
834 * => caller should leave uvm_swap_data_lock unlocked, we may lock it 835 * if needed. 836 */ 837 static int 838 swap_on(struct lwp *l, struct swapdev *sdp) 839 { 840 struct vnode *vp; 841 int error, npages, nblocks, size; 842 long addr; 843 vmem_addr_t result; 844 struct vattr va; 845 dev_t dev; 846 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 847 848 /* 849 * we want to enable swapping on sdp. the swd_vp contains 850 * the vnode we want (locked and ref'd), and the swd_dev 851 * contains the dev_t of the file, if it a block device. 852 */ 853 854 vp = sdp->swd_vp; 855 dev = sdp->swd_dev; 856 857 /* 858 * open the swap file (mostly useful for block device files to 859 * let device driver know what is up). 860 * 861 * we skip the open/close for root on swap because the root 862 * has already been opened when root was mounted (mountroot). 863 */ 864 if (vp != rootvp) { 865 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred))) 866 return (error); 867 } 868 869 /* XXX this only works for block devices */ 870 UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0); 871 872 /* 873 * we now need to determine the size of the swap area. for 874 * block specials we can call the d_psize function. 875 * for normal files, we must stat [get attrs]. 876 * 877 * we put the result in nblks. 878 * for normal files, we also want the filesystem block size 879 * (which we get with statfs). 880 */ 881 switch (vp->v_type) { 882 case VBLK: 883 if ((nblocks = bdev_size(dev)) == -1) { 884 error = ENXIO; 885 goto bad; 886 } 887 break; 888 889 case VREG: 890 if ((error = VOP_GETATTR(vp, &va, l->l_cred))) 891 goto bad; 892 nblocks = (int)btodb(va.va_size); 893 sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift; 894 /* 895 * limit the max # of outstanding I/O requests we issue 896 * at any one time. take it easy on NFS servers. 
897 */ 898 if (vp->v_tag == VT_NFS) 899 sdp->swd_maxactive = 2; /* XXX */ 900 else 901 sdp->swd_maxactive = 8; /* XXX */ 902 break; 903 904 default: 905 error = ENXIO; 906 goto bad; 907 } 908 909 /* 910 * save nblocks in a safe place and convert to pages. 911 */ 912 913 sdp->swd_nblks = nblocks; 914 npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT; 915 916 /* 917 * for block special files, we want to make sure that leave 918 * the disklabel and bootblocks alone, so we arrange to skip 919 * over them (arbitrarily choosing to skip PAGE_SIZE bytes). 920 * note that because of this the "size" can be less than the 921 * actual number of blocks on the device. 922 */ 923 if (vp->v_type == VBLK) { 924 /* we use pages 1 to (size - 1) [inclusive] */ 925 size = npages - 1; 926 addr = 1; 927 } else { 928 /* we use pages 0 to (size - 1) [inclusive] */ 929 size = npages; 930 addr = 0; 931 } 932 933 /* 934 * make sure we have enough blocks for a reasonable sized swap 935 * area. we want at least one page. 936 */ 937 938 if (size < 1) { 939 UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0); 940 error = EINVAL; 941 goto bad; 942 } 943 944 UVMHIST_LOG(pdhist, " dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0); 945 946 /* 947 * now we need to allocate an extent to manage this swap device 948 */ 949 950 sdp->swd_blist = blist_create(npages); 951 /* mark all expect the `saved' region free. */ 952 blist_free(sdp->swd_blist, addr, size); 953 954 /* 955 * allocate space to for swap encryption state and mark the 956 * keys uninitialized so we generate them lazily 957 */ 958 sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP); 959 sdp->swd_encinit = false; 960 961 /* 962 * if the vnode we are swapping to is the root vnode 963 * (i.e. we are swapping to the miniroot) then we want 964 * to make sure we don't overwrite it. do a statfs to 965 * find its size and skip over it. 
966 */ 967 if (vp == rootvp) { 968 struct mount *mp; 969 struct statvfs *sp; 970 int rootblocks, rootpages; 971 972 mp = rootvnode->v_mount; 973 sp = &mp->mnt_stat; 974 rootblocks = sp->f_blocks * btodb(sp->f_frsize); 975 /* 976 * XXX: sp->f_blocks isn't the total number of 977 * blocks in the filesystem, it's the number of 978 * data blocks. so, our rootblocks almost 979 * definitely underestimates the total size 980 * of the filesystem - how badly depends on the 981 * details of the filesystem type. there isn't 982 * an obvious way to deal with this cleanly 983 * and perfectly, so for now we just pad our 984 * rootblocks estimate with an extra 5 percent. 985 */ 986 rootblocks += (rootblocks >> 5) + 987 (rootblocks >> 6) + 988 (rootblocks >> 7); 989 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT; 990 if (rootpages > size) 991 panic("swap_on: miniroot larger than swap?"); 992 993 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) { 994 panic("swap_on: unable to preserve miniroot"); 995 } 996 997 size -= rootpages; 998 printf("Preserved %d pages of miniroot ", rootpages); 999 printf("leaving %d pages of swap\n", size); 1000 } 1001 1002 /* 1003 * add a ref to vp to reflect usage as a swap device. 1004 */ 1005 vref(vp); 1006 1007 /* 1008 * now add the new swapdev to the drum and enable. 1009 */ 1010 error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result); 1011 if (error != 0) 1012 panic("swapdrum_add"); 1013 /* 1014 * If this is the first regular swap create the workqueue. 1015 * => Protected by swap_syscall_lock. 
1016 */ 1017 if (vp->v_type != VBLK) { 1018 if (sw_reg_count++ == 0) { 1019 KASSERT(sw_reg_workqueue == NULL); 1020 if (workqueue_create(&sw_reg_workqueue, "swapiod", 1021 sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0) 1022 panic("%s: workqueue_create failed", __func__); 1023 } 1024 } 1025 1026 sdp->swd_drumoffset = (int)result; 1027 sdp->swd_drumsize = npages; 1028 sdp->swd_npages = size; 1029 mutex_enter(&uvm_swap_data_lock); 1030 sdp->swd_flags &= ~SWF_FAKE; /* going live */ 1031 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); 1032 uvmexp.swpages += size; 1033 uvmexp.swpgavail += size; 1034 mutex_exit(&uvm_swap_data_lock); 1035 return (0); 1036 1037 /* 1038 * failure: clean up and return error. 1039 */ 1040 1041 bad: 1042 if (sdp->swd_blist) { 1043 blist_destroy(sdp->swd_blist); 1044 } 1045 if (vp != rootvp) { 1046 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred); 1047 } 1048 return (error); 1049 } 1050 1051 /* 1052 * swap_off: stop swapping on swapdev 1053 * 1054 * => swap data should be locked, we will unlock. 1055 */ 1056 static int 1057 swap_off(struct lwp *l, struct swapdev *sdp) 1058 { 1059 int npages = sdp->swd_npages; 1060 int error = 0; 1061 1062 UVMHIST_FUNC(__func__); 1063 UVMHIST_CALLARGS(pdhist, " dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0); 1064 1065 KASSERT(rw_write_held(&swap_syscall_lock)); 1066 KASSERT(mutex_owned(&uvm_swap_data_lock)); 1067 1068 /* disable the swap area being removed */ 1069 sdp->swd_flags &= ~SWF_ENABLE; 1070 uvmexp.swpgavail -= npages; 1071 mutex_exit(&uvm_swap_data_lock); 1072 1073 /* 1074 * the idea is to find all the pages that are paged out to this 1075 * device, and page them all in. in uvm, swap-backed pageable 1076 * memory can take two forms: aobjs and anons. call the 1077 * swapoff hook for each subsystem to bring in pages. 
1078 */ 1079 1080 if (uao_swap_off(sdp->swd_drumoffset, 1081 sdp->swd_drumoffset + sdp->swd_drumsize) || 1082 amap_swap_off(sdp->swd_drumoffset, 1083 sdp->swd_drumoffset + sdp->swd_drumsize)) { 1084 error = ENOMEM; 1085 } else if (sdp->swd_npginuse > sdp->swd_npgbad) { 1086 error = EBUSY; 1087 } 1088 1089 if (error) { 1090 mutex_enter(&uvm_swap_data_lock); 1091 sdp->swd_flags |= SWF_ENABLE; 1092 uvmexp.swpgavail += npages; 1093 mutex_exit(&uvm_swap_data_lock); 1094 1095 return error; 1096 } 1097 1098 /* 1099 * If this is the last regular swap destroy the workqueue. 1100 * => Protected by swap_syscall_lock. 1101 */ 1102 if (sdp->swd_vp->v_type != VBLK) { 1103 KASSERT(sw_reg_count > 0); 1104 KASSERT(sw_reg_workqueue != NULL); 1105 if (--sw_reg_count == 0) { 1106 workqueue_destroy(sw_reg_workqueue); 1107 sw_reg_workqueue = NULL; 1108 } 1109 } 1110 1111 /* 1112 * done with the vnode. 1113 * drop our ref on the vnode before calling VOP_CLOSE() 1114 * so that spec_close() can tell if this is the last close. 1115 */ 1116 vrele(sdp->swd_vp); 1117 if (sdp->swd_vp != rootvp) { 1118 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred); 1119 } 1120 1121 mutex_enter(&uvm_swap_data_lock); 1122 uvmexp.swpages -= npages; 1123 uvmexp.swpginuse -= sdp->swd_npgbad; 1124 1125 if (swaplist_find(sdp->swd_vp, true) == NULL) 1126 panic("%s: swapdev not in list", __func__); 1127 swaplist_trim(); 1128 mutex_exit(&uvm_swap_data_lock); 1129 1130 /* 1131 * free all resources! 
1132 */ 1133 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize); 1134 blist_destroy(sdp->swd_blist); 1135 bufq_free(sdp->swd_tab); 1136 kmem_free(__UNVOLATILE(sdp->swd_encmap), 1137 encmap_size(sdp->swd_drumsize)); 1138 explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey); 1139 explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey); 1140 kmem_free(sdp, sizeof(*sdp)); 1141 return (0); 1142 } 1143 1144 void 1145 uvm_swap_shutdown(struct lwp *l) 1146 { 1147 struct swapdev *sdp; 1148 struct swappri *spp; 1149 struct vnode *vp; 1150 int error; 1151 1152 if (!uvm_swap_init_done || uvmexp.nswapdev == 0) 1153 return; 1154 printf("turning off swap..."); 1155 rw_enter(&swap_syscall_lock, RW_WRITER); 1156 mutex_enter(&uvm_swap_data_lock); 1157 again: 1158 LIST_FOREACH(spp, &swap_priority, spi_swappri) 1159 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1160 if (sdp->swd_flags & SWF_FAKE) 1161 continue; 1162 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) 1163 continue; 1164 #ifdef DEBUG 1165 printf("\nturning off swap on %s...", sdp->swd_path); 1166 #endif 1167 /* Have to lock and reference vnode for swap_off(). */ 1168 vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY); 1169 vref(vp); 1170 error = swap_off(l, sdp); 1171 vput(vp); 1172 mutex_enter(&uvm_swap_data_lock); 1173 if (error) { 1174 printf("stopping swap on %s failed " 1175 "with error %d\n", sdp->swd_path, error); 1176 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1177 uvmexp.nswapdev--; 1178 swaplist_trim(); 1179 } 1180 goto again; 1181 } 1182 printf(" done\n"); 1183 mutex_exit(&uvm_swap_data_lock); 1184 rw_exit(&swap_syscall_lock); 1185 } 1186 1187 1188 /* 1189 * /dev/drum interface and i/o functions 1190 */ 1191 1192 /* 1193 * swopen: allow the initial open from uvm_swap_init() and reject all others. 
1194 */ 1195 1196 static int 1197 swopen(dev_t dev, int flag, int mode, struct lwp *l) 1198 { 1199 static bool inited = false; 1200 1201 if (!inited) { 1202 inited = true; 1203 return 0; 1204 } 1205 return ENODEV; 1206 } 1207 1208 /* 1209 * swstrategy: perform I/O on the drum 1210 * 1211 * => we must map the i/o request from the drum to the correct swapdev. 1212 */ 1213 static void 1214 swstrategy(struct buf *bp) 1215 { 1216 struct swapdev *sdp; 1217 struct vnode *vp; 1218 int pageno, bn; 1219 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1220 1221 /* 1222 * convert block number to swapdev. note that swapdev can't 1223 * be yanked out from under us because we are holding resources 1224 * in it (i.e. the blocks we are doing I/O on). 1225 */ 1226 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; 1227 mutex_enter(&uvm_swap_data_lock); 1228 sdp = swapdrum_getsdp(pageno); 1229 mutex_exit(&uvm_swap_data_lock); 1230 if (sdp == NULL) { 1231 bp->b_error = EINVAL; 1232 bp->b_resid = bp->b_bcount; 1233 biodone(bp); 1234 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0); 1235 return; 1236 } 1237 1238 /* 1239 * convert drum page number to block number on this swapdev. 1240 */ 1241 1242 pageno -= sdp->swd_drumoffset; /* page # on swapdev */ 1243 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ 1244 1245 UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd", 1246 ((bp->b_flags & B_READ) == 0) ? 1 : 0, 1247 sdp->swd_drumoffset, bn, bp->b_bcount); 1248 1249 /* 1250 * for block devices we finish up here. 1251 * for regular files we have to do more work which we delegate 1252 * to sw_reg_strategy(). 1253 */ 1254 1255 vp = sdp->swd_vp; /* swapdev vnode pointer */ 1256 switch (vp->v_type) { 1257 default: 1258 panic("%s: vnode type 0x%x", __func__, vp->v_type); 1259 1260 case VBLK: 1261 1262 /* 1263 * must convert "bp" from an I/O on /dev/drum to an I/O 1264 * on the swapdev (sdp). 
1265 */ 1266 bp->b_blkno = bn; /* swapdev block number */ 1267 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */ 1268 1269 /* 1270 * if we are doing a write, we have to redirect the i/o on 1271 * drum's v_numoutput counter to the swapdevs. 1272 */ 1273 if ((bp->b_flags & B_READ) == 0) { 1274 mutex_enter(bp->b_objlock); 1275 vwakeup(bp); /* kills one 'v_numoutput' on drum */ 1276 mutex_exit(bp->b_objlock); 1277 mutex_enter(vp->v_interlock); 1278 vp->v_numoutput++; /* put it on swapdev */ 1279 mutex_exit(vp->v_interlock); 1280 } 1281 1282 /* 1283 * finally plug in swapdev vnode and start I/O 1284 */ 1285 bp->b_vp = vp; 1286 bp->b_objlock = vp->v_interlock; 1287 VOP_STRATEGY(vp, bp); 1288 return; 1289 1290 case VREG: 1291 /* 1292 * delegate to sw_reg_strategy function. 1293 */ 1294 sw_reg_strategy(sdp, bp, bn); 1295 return; 1296 } 1297 /* NOTREACHED */ 1298 } 1299 1300 /* 1301 * swread: the read function for the drum (just a call to physio) 1302 */ 1303 /*ARGSUSED*/ 1304 static int 1305 swread(dev_t dev, struct uio *uio, int ioflag) 1306 { 1307 UVMHIST_FUNC(__func__); 1308 UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); 1309 1310 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio)); 1311 } 1312 1313 /* 1314 * swwrite: the write function for the drum (just a call to physio) 1315 */ 1316 /*ARGSUSED*/ 1317 static int 1318 swwrite(dev_t dev, struct uio *uio, int ioflag) 1319 { 1320 UVMHIST_FUNC(__func__); 1321 UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); 1322 1323 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio)); 1324 } 1325 1326 const struct bdevsw swap_bdevsw = { 1327 .d_open = swopen, 1328 .d_close = noclose, 1329 .d_strategy = swstrategy, 1330 .d_ioctl = noioctl, 1331 .d_dump = nodump, 1332 .d_psize = nosize, 1333 .d_discard = nodiscard, 1334 .d_flag = D_OTHER 1335 }; 1336 1337 const struct cdevsw swap_cdevsw = { 1338 .d_open = nullopen, 1339 .d_close = nullclose, 1340 .d_read = swread, 
1341 .d_write = swwrite, 1342 .d_ioctl = noioctl, 1343 .d_stop = nostop, 1344 .d_tty = notty, 1345 .d_poll = nopoll, 1346 .d_mmap = nommap, 1347 .d_kqfilter = nokqfilter, 1348 .d_discard = nodiscard, 1349 .d_flag = D_OTHER, 1350 }; 1351 1352 /* 1353 * sw_reg_strategy: handle swap i/o to regular files 1354 */ 1355 static void 1356 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) 1357 { 1358 struct vnode *vp; 1359 struct vndxfer *vnx; 1360 daddr_t nbn; 1361 char *addr; 1362 off_t byteoff; 1363 int s, off, nra, error, sz, resid; 1364 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1365 1366 /* 1367 * allocate a vndxfer head for this transfer and point it to 1368 * our buffer. 1369 */ 1370 vnx = pool_get(&vndxfer_pool, PR_WAITOK); 1371 vnx->vx_flags = VX_BUSY; 1372 vnx->vx_error = 0; 1373 vnx->vx_pending = 0; 1374 vnx->vx_bp = bp; 1375 vnx->vx_sdp = sdp; 1376 1377 /* 1378 * setup for main loop where we read filesystem blocks into 1379 * our buffer. 1380 */ 1381 error = 0; 1382 bp->b_resid = bp->b_bcount; /* nothing transferred yet! */ 1383 addr = bp->b_data; /* current position in buffer */ 1384 byteoff = dbtob((uint64_t)bn); 1385 1386 for (resid = bp->b_resid; resid; resid -= sz) { 1387 struct vndbuf *nbp; 1388 1389 /* 1390 * translate byteoffset into block number. return values: 1391 * vp = vnode of underlying device 1392 * nbn = new block number (on underlying vnode dev) 1393 * nra = num blocks we can read-ahead (excludes requested 1394 * block) 1395 */ 1396 nra = 0; 1397 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, 1398 &vp, &nbn, &nra); 1399 1400 if (error == 0 && nbn == (daddr_t)-1) { 1401 /* 1402 * this used to just set error, but that doesn't 1403 * do the right thing. Instead, it causes random 1404 * memory errors. The panic() should remain until 1405 * this condition doesn't destabilize the system. 
1406 */ 1407 #if 1 1408 panic("%s: swap to sparse file", __func__); 1409 #else 1410 error = EIO; /* failure */ 1411 #endif 1412 } 1413 1414 /* 1415 * punt if there was an error or a hole in the file. 1416 * we must wait for any i/o ops we have already started 1417 * to finish before returning. 1418 * 1419 * XXX we could deal with holes here but it would be 1420 * a hassle (in the write case). 1421 */ 1422 if (error) { 1423 s = splbio(); 1424 vnx->vx_error = error; /* pass error up */ 1425 goto out; 1426 } 1427 1428 /* 1429 * compute the size ("sz") of this transfer (in bytes). 1430 */ 1431 off = byteoff % sdp->swd_bsize; 1432 sz = (1 + nra) * sdp->swd_bsize - off; 1433 if (sz > resid) 1434 sz = resid; 1435 1436 UVMHIST_LOG(pdhist, "sw_reg_strategy: " 1437 "vp %#jx/%#jx offset %#jx/%#jx", 1438 (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn); 1439 1440 /* 1441 * now get a buf structure. note that the vb_buf is 1442 * at the front of the nbp structure so that you can 1443 * cast pointers between the two structure easily. 
1444 */ 1445 nbp = pool_get(&vndbuf_pool, PR_WAITOK); 1446 buf_init(&nbp->vb_buf); 1447 nbp->vb_buf.b_flags = bp->b_flags; 1448 nbp->vb_buf.b_cflags = bp->b_cflags; 1449 nbp->vb_buf.b_oflags = bp->b_oflags; 1450 nbp->vb_buf.b_bcount = sz; 1451 nbp->vb_buf.b_bufsize = sz; 1452 nbp->vb_buf.b_error = 0; 1453 nbp->vb_buf.b_data = addr; 1454 nbp->vb_buf.b_lblkno = 0; 1455 nbp->vb_buf.b_blkno = nbn + btodb(off); 1456 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno; 1457 nbp->vb_buf.b_iodone = sw_reg_biodone; 1458 nbp->vb_buf.b_vp = vp; 1459 nbp->vb_buf.b_objlock = vp->v_interlock; 1460 if (vp->v_type == VBLK) { 1461 nbp->vb_buf.b_dev = vp->v_rdev; 1462 } 1463 1464 nbp->vb_xfer = vnx; /* patch it back in to vnx */ 1465 1466 /* 1467 * Just sort by block number 1468 */ 1469 s = splbio(); 1470 if (vnx->vx_error != 0) { 1471 buf_destroy(&nbp->vb_buf); 1472 pool_put(&vndbuf_pool, nbp); 1473 goto out; 1474 } 1475 vnx->vx_pending++; 1476 1477 /* sort it in and start I/O if we are not over our limit */ 1478 /* XXXAD locking */ 1479 bufq_put(sdp->swd_tab, &nbp->vb_buf); 1480 sw_reg_start(sdp); 1481 splx(s); 1482 1483 /* 1484 * advance to the next I/O 1485 */ 1486 byteoff += sz; 1487 addr += sz; 1488 } 1489 1490 s = splbio(); 1491 1492 out: /* Arrive here at splbio */ 1493 vnx->vx_flags &= ~VX_BUSY; 1494 if (vnx->vx_pending == 0) { 1495 error = vnx->vx_error; 1496 pool_put(&vndxfer_pool, vnx); 1497 bp->b_error = error; 1498 biodone(bp); 1499 } 1500 splx(s); 1501 } 1502 1503 /* 1504 * sw_reg_start: start an I/O request on the requested swapdev 1505 * 1506 * => reqs are sorted by b_rawblkno (above) 1507 */ 1508 static void 1509 sw_reg_start(struct swapdev *sdp) 1510 { 1511 struct buf *bp; 1512 struct vnode *vp; 1513 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1514 1515 /* recursion control */ 1516 if ((sdp->swd_flags & SWF_BUSY) != 0) 1517 return; 1518 1519 sdp->swd_flags |= SWF_BUSY; 1520 1521 while (sdp->swd_active < sdp->swd_maxactive) { 1522 bp = bufq_get(sdp->swd_tab); 1523 if 
(bp == NULL) 1524 break; 1525 sdp->swd_active++; 1526 1527 UVMHIST_LOG(pdhist, 1528 "sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %#jx", 1529 (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno, 1530 bp->b_bcount); 1531 vp = bp->b_vp; 1532 KASSERT(bp->b_objlock == vp->v_interlock); 1533 if ((bp->b_flags & B_READ) == 0) { 1534 mutex_enter(vp->v_interlock); 1535 vp->v_numoutput++; 1536 mutex_exit(vp->v_interlock); 1537 } 1538 VOP_STRATEGY(vp, bp); 1539 } 1540 sdp->swd_flags &= ~SWF_BUSY; 1541 } 1542 1543 /* 1544 * sw_reg_biodone: one of our i/o's has completed 1545 */ 1546 static void 1547 sw_reg_biodone(struct buf *bp) 1548 { 1549 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL); 1550 } 1551 1552 /* 1553 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup 1554 * 1555 * => note that we can recover the vndbuf struct by casting the buf ptr 1556 */ 1557 static void 1558 sw_reg_iodone(struct work *wk, void *dummy) 1559 { 1560 struct vndbuf *vbp = (void *)wk; 1561 struct vndxfer *vnx = vbp->vb_xfer; 1562 struct buf *pbp = vnx->vx_bp; /* parent buffer */ 1563 struct swapdev *sdp = vnx->vx_sdp; 1564 int s, resid, error; 1565 KASSERT(&vbp->vb_buf.b_work == wk); 1566 UVMHIST_FUNC(__func__); 1567 UVMHIST_CALLARGS(pdhist, " vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx", 1568 (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, 1569 (uintptr_t)vbp->vb_buf.b_data); 1570 UVMHIST_LOG(pdhist, " cnt=%#jx resid=%#jx", 1571 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0); 1572 1573 /* 1574 * protect vbp at splbio and update. 1575 */ 1576 1577 s = splbio(); 1578 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; 1579 pbp->b_resid -= resid; 1580 vnx->vx_pending--; 1581 1582 if (vbp->vb_buf.b_error != 0) { 1583 /* pass error upward */ 1584 error = vbp->vb_buf.b_error ? 
vbp->vb_buf.b_error : EIO; 1585 UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0); 1586 vnx->vx_error = error; 1587 } 1588 1589 /* 1590 * kill vbp structure 1591 */ 1592 buf_destroy(&vbp->vb_buf); 1593 pool_put(&vndbuf_pool, vbp); 1594 1595 /* 1596 * wrap up this transaction if it has run to completion or, in 1597 * case of an error, when all auxiliary buffers have returned. 1598 */ 1599 if (vnx->vx_error != 0) { 1600 /* pass error upward */ 1601 error = vnx->vx_error; 1602 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { 1603 pbp->b_error = error; 1604 biodone(pbp); 1605 pool_put(&vndxfer_pool, vnx); 1606 } 1607 } else if (pbp->b_resid == 0) { 1608 KASSERT(vnx->vx_pending == 0); 1609 if ((vnx->vx_flags & VX_BUSY) == 0) { 1610 UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !", 1611 (uintptr_t)pbp, vnx->vx_error, 0, 0); 1612 biodone(pbp); 1613 pool_put(&vndxfer_pool, vnx); 1614 } 1615 } 1616 1617 /* 1618 * done! start next swapdev I/O if one is pending 1619 */ 1620 sdp->swd_active--; 1621 sw_reg_start(sdp); 1622 splx(s); 1623 } 1624 1625 1626 /* 1627 * uvm_swap_alloc: allocate space on swap 1628 * 1629 * => allocation is done "round robin" down the priority list, as we 1630 * allocate in a priority we "rotate" the circle queue. 1631 * => space can be freed with uvm_swap_free 1632 * => we return the page slot number in /dev/drum (0 == invalid slot) 1633 * => we lock uvm_swap_data_lock 1634 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM 1635 */ 1636 int 1637 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok) 1638 { 1639 struct swapdev *sdp; 1640 struct swappri *spp; 1641 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1642 1643 /* 1644 * no swap devices configured yet? definite failure. 1645 */ 1646 if (uvmexp.nswapdev < 1) 1647 return 0; 1648 1649 /* 1650 * XXXJAK: BEGIN HACK 1651 * 1652 * blist_alloc() in subr_blist.c will panic if we try to allocate 1653 * too many slots. 
1654 */ 1655 if (*nslots > BLIST_MAX_ALLOC) { 1656 if (__predict_false(lessok == false)) 1657 return 0; 1658 *nslots = BLIST_MAX_ALLOC; 1659 } 1660 /* XXXJAK: END HACK */ 1661 1662 /* 1663 * lock data lock, convert slots into blocks, and enter loop 1664 */ 1665 mutex_enter(&uvm_swap_data_lock); 1666 1667 ReTry: /* XXXMRG */ 1668 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 1669 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1670 uint64_t result; 1671 1672 /* if it's not enabled, then we can't swap from it */ 1673 if ((sdp->swd_flags & SWF_ENABLE) == 0) 1674 continue; 1675 if (sdp->swd_npginuse + *nslots > sdp->swd_npages) 1676 continue; 1677 result = blist_alloc(sdp->swd_blist, *nslots); 1678 if (result == BLIST_NONE) { 1679 continue; 1680 } 1681 KASSERT(result < sdp->swd_drumsize); 1682 1683 /* 1684 * successful allocation! now rotate the tailq. 1685 */ 1686 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1687 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 1688 sdp->swd_npginuse += *nslots; 1689 uvmexp.swpginuse += *nslots; 1690 mutex_exit(&uvm_swap_data_lock); 1691 /* done! return drum slot number */ 1692 UVMHIST_LOG(pdhist, 1693 "success! returning %jd slots starting at %jd", 1694 *nslots, result + sdp->swd_drumoffset, 0, 0); 1695 return (result + sdp->swd_drumoffset); 1696 } 1697 } 1698 1699 /* XXXMRG: BEGIN HACK */ 1700 if (*nslots > 1 && lessok) { 1701 *nslots = 1; 1702 /* XXXMRG: ugh! blist should support this for us */ 1703 goto ReTry; 1704 } 1705 /* XXXMRG: END HACK */ 1706 1707 mutex_exit(&uvm_swap_data_lock); 1708 return 0; 1709 } 1710 1711 /* 1712 * uvm_swapisfull: return true if most of available swap is allocated 1713 * and in use. we don't count some small portion as it may be inaccessible 1714 * to us at any given moment, for example if there is lock contention or if 1715 * pages are busy. 
1716 */ 1717 bool 1718 uvm_swapisfull(void) 1719 { 1720 int swpgonly; 1721 bool rv; 1722 1723 if (uvmexp.swpages == 0) { 1724 return true; 1725 } 1726 1727 mutex_enter(&uvm_swap_data_lock); 1728 KASSERT(uvmexp.swpgonly <= uvmexp.swpages); 1729 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 / 1730 uvm_swapisfull_factor); 1731 rv = (swpgonly >= uvmexp.swpgavail); 1732 mutex_exit(&uvm_swap_data_lock); 1733 1734 return (rv); 1735 } 1736 1737 /* 1738 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors 1739 * 1740 * => we lock uvm_swap_data_lock 1741 */ 1742 void 1743 uvm_swap_markbad(int startslot, int nslots) 1744 { 1745 struct swapdev *sdp; 1746 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1747 1748 mutex_enter(&uvm_swap_data_lock); 1749 sdp = swapdrum_getsdp(startslot); 1750 KASSERT(sdp != NULL); 1751 1752 /* 1753 * we just keep track of how many pages have been marked bad 1754 * in this device, to make everything add up in swap_off(). 1755 * we assume here that the range of slots will all be within 1756 * one swap device. 1757 */ 1758 1759 KASSERT(uvmexp.swpgonly >= nslots); 1760 atomic_add_int(&uvmexp.swpgonly, -nslots); 1761 sdp->swd_npgbad += nslots; 1762 UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0); 1763 mutex_exit(&uvm_swap_data_lock); 1764 } 1765 1766 /* 1767 * uvm_swap_free: free swap slots 1768 * 1769 * => this can be all or part of an allocation made by uvm_swap_alloc 1770 * => we lock uvm_swap_data_lock 1771 */ 1772 void 1773 uvm_swap_free(int startslot, int nslots) 1774 { 1775 struct swapdev *sdp; 1776 UVMHIST_FUNC(__func__); 1777 UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots, 1778 startslot, 0, 0); 1779 1780 /* 1781 * ignore attempts to free the "bad" slot. 1782 */ 1783 1784 if (startslot == SWSLOT_BAD) { 1785 return; 1786 } 1787 1788 /* 1789 * convert drum slot offset back to sdp, free the blocks 1790 * in the extent, and return. must hold pri lock to do 1791 * lookup and access the extent. 
1792 */ 1793 1794 mutex_enter(&uvm_swap_data_lock); 1795 sdp = swapdrum_getsdp(startslot); 1796 KASSERT(uvmexp.nswapdev >= 1); 1797 KASSERT(sdp != NULL); 1798 KASSERT(sdp->swd_npginuse >= nslots); 1799 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); 1800 sdp->swd_npginuse -= nslots; 1801 uvmexp.swpginuse -= nslots; 1802 mutex_exit(&uvm_swap_data_lock); 1803 } 1804 1805 /* 1806 * uvm_swap_put: put any number of pages into a contig place on swap 1807 * 1808 * => can be sync or async 1809 */ 1810 1811 int 1812 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) 1813 { 1814 int error; 1815 1816 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE | 1817 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1818 return error; 1819 } 1820 1821 /* 1822 * uvm_swap_get: get a single page from swap 1823 * 1824 * => usually a sync op (from fault) 1825 */ 1826 1827 int 1828 uvm_swap_get(struct vm_page *page, int swslot, int flags) 1829 { 1830 int error; 1831 1832 atomic_inc_uint(&uvmexp.nswget); 1833 KASSERT(flags & PGO_SYNCIO); 1834 if (swslot == SWSLOT_BAD) { 1835 return EIO; 1836 } 1837 1838 error = uvm_swap_io(&page, swslot, 1, B_READ | 1839 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1840 if (error == 0) { 1841 1842 /* 1843 * this page is no longer only in swap. 
1844 */ 1845 1846 KASSERT(uvmexp.swpgonly > 0); 1847 atomic_dec_uint(&uvmexp.swpgonly); 1848 } 1849 return error; 1850 } 1851 1852 /* 1853 * uvm_swap_io: do an i/o operation to swap 1854 */ 1855 1856 static int 1857 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) 1858 { 1859 daddr_t startblk; 1860 struct buf *bp; 1861 vaddr_t kva; 1862 int error, mapinflags; 1863 bool write, async, swap_encrypt; 1864 UVMHIST_FUNC(__func__); 1865 UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx", 1866 startslot, npages, flags, 0); 1867 1868 write = (flags & B_READ) == 0; 1869 async = (flags & B_ASYNC) != 0; 1870 swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt); 1871 1872 /* 1873 * allocate a buf for the i/o. 1874 */ 1875 1876 KASSERT(curlwp != uvm.pagedaemon_lwp || write); 1877 KASSERT(curlwp != uvm.pagedaemon_lwp || async); 1878 bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp); 1879 if (bp == NULL) { 1880 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM); 1881 return ENOMEM; 1882 } 1883 1884 /* 1885 * convert starting drum slot to block number 1886 */ 1887 1888 startblk = btodb((uint64_t)startslot << PAGE_SHIFT); 1889 1890 /* 1891 * first, map the pages into the kernel. 1892 */ 1893 1894 mapinflags = !write ? 1895 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ : 1896 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE; 1897 if (write && swap_encrypt) /* need to encrypt in-place */ 1898 mapinflags |= UVMPAGER_MAPIN_READ; 1899 kva = uvm_pagermapin(pps, npages, mapinflags); 1900 1901 /* 1902 * encrypt writes in place if requested 1903 */ 1904 1905 if (write) do { 1906 struct swapdev *sdp; 1907 int i; 1908 1909 /* 1910 * Get the swapdev so we can discriminate on the 1911 * encryption state. There may or may not be an 1912 * encryption key generated; we may or may not be asked 1913 * to encrypt swap. 1914 * 1915 * 1. NO KEY, NO ENCRYPTION: Nothing to do. 1916 * 1917 * 2. 
NO KEY, BUT ENCRYPTION: Generate a key, encrypt, 1918 * and mark the slots encrypted. 1919 * 1920 * 3. KEY, BUT NO ENCRYPTION: The slots may already be 1921 * marked encrypted from a past life. Mark them not 1922 * encrypted. 1923 * 1924 * 4. KEY, ENCRYPTION: Encrypt and mark the slots 1925 * encrypted. 1926 */ 1927 mutex_enter(&uvm_swap_data_lock); 1928 sdp = swapdrum_getsdp(startslot); 1929 if (!sdp->swd_encinit) { 1930 if (!swap_encrypt) { 1931 mutex_exit(&uvm_swap_data_lock); 1932 break; 1933 } 1934 uvm_swap_genkey(sdp); 1935 } 1936 KASSERT(sdp->swd_encinit); 1937 mutex_exit(&uvm_swap_data_lock); 1938 1939 for (i = 0; i < npages; i++) { 1940 int s = startslot + i; 1941 KDASSERT(swapdrum_sdp_is(s, sdp)); 1942 KASSERT(s >= sdp->swd_drumoffset); 1943 s -= sdp->swd_drumoffset; 1944 KASSERT(s < sdp->swd_drumsize); 1945 1946 if (swap_encrypt) { 1947 uvm_swap_encryptpage(sdp, 1948 (void *)(kva + (vsize_t)i*PAGE_SIZE), s); 1949 atomic_or_32(&sdp->swd_encmap[s/32], 1950 __BIT(s%32)); 1951 } else { 1952 atomic_and_32(&sdp->swd_encmap[s/32], 1953 ~__BIT(s%32)); 1954 } 1955 } 1956 } while (0); 1957 1958 /* 1959 * fill in the bp/sbp. we currently route our i/o through 1960 * /dev/drum's vnode [swapdev_vp]. 1961 */ 1962 1963 bp->b_cflags = BC_BUSY | BC_NOCACHE; 1964 bp->b_flags = (flags & (B_READ|B_ASYNC)); 1965 bp->b_proc = &proc0; /* XXX */ 1966 bp->b_vnbufs.le_next = NOLIST; 1967 bp->b_data = (void *)kva; 1968 bp->b_blkno = startblk; 1969 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT; 1970 1971 /* 1972 * bump v_numoutput (counter of number of active outputs). 1973 */ 1974 1975 if (write) { 1976 mutex_enter(swapdev_vp->v_interlock); 1977 swapdev_vp->v_numoutput++; 1978 mutex_exit(swapdev_vp->v_interlock); 1979 } 1980 1981 /* 1982 * for async ops we must set up the iodone handler. 
1983 */ 1984 1985 if (async) { 1986 bp->b_iodone = uvm_aio_aiodone; 1987 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0); 1988 if (curlwp == uvm.pagedaemon_lwp) 1989 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1990 else 1991 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 1992 } else { 1993 bp->b_iodone = NULL; 1994 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1995 } 1996 UVMHIST_LOG(pdhist, 1997 "about to start io: data = %#jx blkno = %#jx, bcount = %jd", 1998 (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0); 1999 2000 /* 2001 * now we start the I/O, and if async, return. 2002 */ 2003 2004 VOP_STRATEGY(swapdev_vp, bp); 2005 if (async) { 2006 /* 2007 * Reads are always synchronous; if this changes, we 2008 * need to add an asynchronous path for decryption. 2009 */ 2010 KASSERT(write); 2011 return 0; 2012 } 2013 2014 /* 2015 * must be sync i/o. wait for it to finish 2016 */ 2017 2018 error = biowait(bp); 2019 if (error) 2020 goto out; 2021 2022 /* 2023 * decrypt reads in place if needed 2024 */ 2025 2026 if (!write) do { 2027 struct swapdev *sdp; 2028 bool encinit; 2029 int i; 2030 2031 /* 2032 * Get the sdp. Everything about it except the encinit 2033 * bit, saying whether the encryption key is 2034 * initialized or not, and the encrypted bit for each 2035 * page, is stable until all swap pages have been 2036 * released and the device is removed. 2037 */ 2038 mutex_enter(&uvm_swap_data_lock); 2039 sdp = swapdrum_getsdp(startslot); 2040 encinit = sdp->swd_encinit; 2041 mutex_exit(&uvm_swap_data_lock); 2042 2043 if (!encinit) 2044 /* 2045 * If there's no encryption key, there's no way 2046 * any of these slots can be encrypted, so 2047 * nothing to do here. 
2048 */ 2049 break; 2050 for (i = 0; i < npages; i++) { 2051 int s = startslot + i; 2052 KDASSERT(swapdrum_sdp_is(s, sdp)); 2053 KASSERT(s >= sdp->swd_drumoffset); 2054 s -= sdp->swd_drumoffset; 2055 KASSERT(s < sdp->swd_drumsize); 2056 if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) & 2057 __BIT(s%32)) == 0) 2058 continue; 2059 uvm_swap_decryptpage(sdp, 2060 (void *)(kva + (vsize_t)i*PAGE_SIZE), s); 2061 } 2062 } while (0); 2063 out: 2064 /* 2065 * kill the pager mapping 2066 */ 2067 2068 uvm_pagermapout(kva, npages); 2069 2070 /* 2071 * now dispose of the buf and we're done. 2072 */ 2073 2074 if (write) { 2075 mutex_enter(swapdev_vp->v_interlock); 2076 vwakeup(bp); 2077 mutex_exit(swapdev_vp->v_interlock); 2078 } 2079 putiobuf(bp); 2080 UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0); 2081 2082 return (error); 2083 } 2084 2085 /* 2086 * uvm_swap_genkey(sdp) 2087 * 2088 * Generate a key for swap encryption. 2089 */ 2090 static void 2091 uvm_swap_genkey(struct swapdev *sdp) 2092 { 2093 uint8_t key[32]; 2094 2095 KASSERT(!sdp->swd_encinit); 2096 2097 cprng_strong(kern_cprng, key, sizeof key, 0); 2098 aes_setenckey256(&sdp->swd_enckey, key); 2099 aes_setdeckey256(&sdp->swd_deckey, key); 2100 explicit_memset(key, 0, sizeof key); 2101 2102 sdp->swd_encinit = true; 2103 } 2104 2105 /* 2106 * uvm_swap_encryptpage(sdp, kva, slot) 2107 * 2108 * Encrypt one page of data at kva for the specified slot number 2109 * in the swap device. 
2110 */ 2111 static void 2112 uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot) 2113 { 2114 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16); 2115 2116 /* iv := AES_k(le32enc(slot) || 0^96) */ 2117 le32enc(preiv, slot); 2118 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS); 2119 2120 /* *kva := AES-CBC_k(iv, *kva) */ 2121 aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv, 2122 AES_256_NROUNDS); 2123 2124 explicit_memset(&iv, 0, sizeof iv); 2125 } 2126 2127 /* 2128 * uvm_swap_decryptpage(sdp, kva, slot) 2129 * 2130 * Decrypt one page of data at kva for the specified slot number 2131 * in the swap device. 2132 */ 2133 static void 2134 uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot) 2135 { 2136 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16); 2137 2138 /* iv := AES_k(le32enc(slot) || 0^96) */ 2139 le32enc(preiv, slot); 2140 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS); 2141 2142 /* *kva := AES-CBC^{-1}_k(iv, *kva) */ 2143 aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv, 2144 AES_256_NROUNDS); 2145 2146 explicit_memset(&iv, 0, sizeof iv); 2147 } 2148 2149 SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup") 2150 { 2151 2152 sysctl_createv(clog, 0, NULL, NULL, 2153 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt", 2154 SYSCTL_DESCR("Encrypt data when swapped out to disk"), 2155 NULL, 0, &uvm_swap_encrypt, 0, 2156 CTL_VM, CTL_CREATE, CTL_EOL); 2157 } 2158