/*	$NetBSD: uvm_swap.c,v 1.198 2020/07/25 22:14:35 riastradh Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.198 2020/07/25 22:14:35 riastradh Exp $");

#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/conf.h>
#include <sys/cprng.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vmem.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

#include <crypto/aes/aes.h>
#include <crypto/aes/aes_cbc.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * TAILQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array. The actual work is done
 *	in the uvm_swap_stats() function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
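
/*
 * Illustrative userland usage (an assumption for exposition, not part
 * of this file): a program might enable a device at priority 0 and read
 * the statistics back roughly like
 *
 *	swapctl(SWAP_ON, "/dev/wd0b", 0);
 *	int n = swapctl(SWAP_NSWAP, NULL, 0);
 *	struct swapent *se = calloc(n, sizeof(*se));
 *	swapctl(SWAP_STATS, se, n);
 */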

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	dev_t			swd_dev;	/* device id */
	int			swd_flags;	/* flags:inuse/enable/fake */
	int			swd_priority;	/* our priority */
	int			swd_nblks;	/* blocks in this device */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */

	volatile uint32_t	*swd_encmap;	/* bitmap of encrypted slots */
	struct aesenc		swd_enckey;	/* AES key expanded for enc */
	struct aesdec		swd_deckey;	/* AES key expanded for dec */
	bool			swd_encinit;	/* true if keys initialized */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
static struct pool vndxfer_pool, vndbuf_pool;

/*
 * local variables
 */
static vmem_t *swapmap;	/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static kmutex_t uvm_swap_data_lock __cacheline_aligned;
static krwlock_t swap_syscall_lock;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/* tuneables */
u_int uvm_swapisfull_factor = 99;
bool uvm_swap_encrypt = false;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

static void uvm_swap_genkey(struct swapdev *);
static void uvm_swap_encryptpage(struct swapdev *, void *, int);
static void uvm_swap_decryptpage(struct swapdev *, void *, int);

static size_t
encmap_size(size_t npages)
{
	struct swapdev *sdp;
	const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
	const size_t bitsperword = NBBY * bytesperword;
	const size_t nbits = npages;	/* one bit for each page */
	const size_t nwords = howmany(nbits, bitsperword);
	const size_t nbytes = nwords * bytesperword;

	return nbytes;
}
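
/*
 * Worked example (illustrative only): with 4 KiB pages, a 1 GiB swap
 * device has 262144 page-sized slots.  One bit per slot, packed into
 * 32-bit words, gives howmany(262144, 32) = 8192 words, i.e. a 32 KiB
 * encryption bitmap.
 */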

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC(__func__);

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("%s: can't get vnode for swap device", __func__);
	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
		panic("%s: can't lock swap device", __func__);
	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
		panic("%s: can't open swap device", __func__);
	VOP_UNLOCK(swapdev_vp);

	/*
	 * create swap block resource map to map /dev/drum.  the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
	    VM_NOSLEEP, IPL_NONE);
	if (swapmap == 0) {
		panic("%s: vmem_create failed", __func__);
	}

	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
	    NULL, IPL_BIO);
	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
	    NULL, IPL_BIO);

	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly allocated swappri structure (we will
 *	FREE it if we don't need it... this is to prevent allocation
 *	from blocking here while adding swap)
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	KASSERT(rw_write_held(&swap_syscall_lock));
	KASSERT(mutex_owned(&uvm_swap_data_lock));

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %jd",
		    priority, 0, 0, 0);

		spp->spi_priority = priority;
		TAILQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		kmem_free(newspp, sizeof(*newspp));
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * tailq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	KASSERT(rw_lock_held(&swap_syscall_lock));
	KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1);
	KASSERT(mutex_owned(&uvm_swap_data_lock));

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					TAILQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
		}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 */
static void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	KASSERT(rw_write_held(&swap_syscall_lock));
	KASSERT(mutex_owned(&uvm_swap_data_lock));

	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
		if (!TAILQ_EMPTY(&spp->spi_swapdev))
			continue;
		LIST_REMOVE(spp, spi_swappri);
		kmem_free(spp, sizeof(*spp));
	}
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm_swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	KASSERT(mutex_owned(&uvm_swap_data_lock));

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}
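
/*
 * Example of the drum layout (illustrative numbers): if device A were
 * assigned drumoffset 1 and drumsize 1000, and device B drumoffset 1001
 * and drumsize 500, then drum page number 1200 falls in B's range and
 * swapdrum_getsdp(1200) returns B's swapdev.
 */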

/*
 * swapdrum_sdp_is: true iff the swap device for pgno is sdp
 *
 * => for use in positive assertions only; result is not stable
 */
static bool __debugused
swapdrum_sdp_is(int pgno, struct swapdev *sdp)
{
	bool result;

	mutex_enter(&uvm_swap_data_lock);
	result = swapdrum_getsdp(pgno) == sdp;
	mutex_exit(&uvm_swap_data_lock);

	return result;
}

void swapsys_lock(krw_t op)
{
	rw_enter(&swap_syscall_lock, op);
}

void swapsys_unlock(void)
{
	rw_exit(&swap_syscall_lock);
}

static void
swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
{
	se->se_dev = sdp->swd_dev;
	se->se_flags = sdp->swd_flags;
	se->se_nblks = sdp->swd_nblks;
	se->se_inuse = inuse;
	se->se_priority = sdp->swd_priority;
	KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
	strcpy(se->se_path, sdp->swd_path);
}

int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
    (void *)enosys;
int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
    (void *)enosys;

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
#define SWAP_PATH_MAX (PATH_MAX + 1)
	char	*userpath;
	size_t	len = 0;
	int	error;
	int	priority;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	/*
	 * we handle the non-priv NSWAP and STATS request first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		const int nswapdev = uvmexp.nswapdev;
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
		    0, 0, 0);
		*retval = nswapdev;
		return 0;
	}

	userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter(&swap_syscall_lock, RW_WRITER);

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm_swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	switch (SCARG(uap, cmd)) {
	case SWAP_STATS13:
		error = (*uvm_swap_stats13)(uap, retval);
		goto out;
	case SWAP_STATS50:
		error = (*uvm_swap_stats50)(uap, retval);
		goto out;
	case SWAP_STATS:
		error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
		    NULL, sizeof(struct swapent), retval);
		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
		goto out;

	case SWAP_GETDUMPDEV:
		error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
		goto out;
	default:
		break;
	}

	/*
	 * all other requests require superuser privs.  verify.
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
	    0, NULL, NULL, NULL)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
		/* drop the current dump device */
		dumpdev = NODEV;
		dumpcdev = NODEV;
		cpu_dumpconf();
		goto out;
	}

	/*
	 * at this point we expect a path name in arg.  we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		vref(vp);
		if (vn_lock(vp, LK_EXCLUSIVE)) {
			vrele(vp);
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		struct pathbuf *pb;

		/*
		 * This used to allow copying in one extra byte
		 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
		 * This was completely pointless because if anyone
		 * used that extra byte namei would fail with
		 * ENAMETOOLONG anyway, so I've removed the excess
		 * logic.				- dholland 20100215
		 */

		error = pathbuf_copyin(SCARG(uap, arg), &pb);
		if (error) {
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON) {
			/* get a copy of the string */
			pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
			len = strlen(userpath) + 1;
		}
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
		if ((error = namei(&nd))) {
			pathbuf_destroy(pb);
			goto out;
		}
		vp = nd.ni_vp;
		pathbuf_destroy(pb);
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		if (bdevsw_lookup(vp->v_rdev)) {
			dumpdev = vp->v_rdev;
			dumpcdev = devsw_blk2chr(dumpdev);
		} else
			dumpdev = NODEV;
		cpu_dumpconf();
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, true)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		mutex_exit(&uvm_swap_data_lock);
		if (error)
			kmem_free(spp, sizeof(*spp));
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.  if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		sdp->swd_flags = SWF_FAKE;
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
		mutex_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, false) != NULL) {
			error = EBUSY;
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			kmem_free(sdp, sizeof(*sdp));
			kmem_free(spp, sizeof(*spp));
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mutex_exit(&uvm_swap_data_lock);

		KASSERT(len > 0);
		sdp->swd_pathlen = len;
		sdp->swd_path = kmem_alloc(len, KM_SLEEP);
		if (copystr(userpath, sdp->swd_path, len, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(l, sdp)) != 0) {
			mutex_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, true);  /* kill fake entry */
			swaplist_trim();
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			kmem_free(sdp->swd_path, sdp->swd_pathlen);
			kmem_free(sdp, sizeof(*sdp));
			break;
		}
		break;

	case SWAP_OFF:
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, false)) == NULL) {
			mutex_exit(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mutex_exit(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(l, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);
out:
	rw_exit(&swap_syscall_lock);
	kmem_free(userpath, SWAP_PATH_MAX);

	UVMHIST_LOG(pdhist, "<- done!  error=%jd", error, 0, 0, 0);
	return (error);
}

/*
 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 * emulation to use it directly without going through sys_swapctl().
 * The problem with using sys_swapctl() there is that it involves
 * copying the swapent array to the stackgap, and this array's size
 * is not known at build time. Hence it would not be possible to
 * ensure it would fit in the stackgap in any case.
 */
int
uvm_swap_stats(char *ptr, int misc,
    void (*f)(void *, const struct swapent *), size_t len,
    register_t *retval)
{
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent sep;
	int count = 0;
	int error;

	KASSERT(len <= sizeof(sep));
	if (len == 0)
		return ENOSYS;

	if (misc < 0)
		return EINVAL;

	if (misc == 0 || uvmexp.nswapdev == 0)
		return 0;

	/* Make sure userland cannot exhaust kernel memory */
	if ((size_t)misc > (size_t)uvmexp.nswapdev)
		misc = uvmexp.nswapdev;

	KASSERT(rw_lock_held(&swap_syscall_lock));

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			int inuse;

			if (misc-- <= 0)
				break;

			inuse = btodb((uint64_t)sdp->swd_npginuse <<
			    PAGE_SHIFT);

			memset(&sep, 0, sizeof(sep));
			swapent_cvt(&sep, sdp, inuse);
			if (f)
				(*f)(&sep, &sep);
			if ((error = copyout(&sep, ptr, len)) != 0)
				return error;
			ptr += len;
			count++;
		}
	}
	*retval = count;
	return 0;
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	vmem_addr_t result;
	struct vattr va;
	dev_t dev;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.  the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);

	/*
	 * we now need to determine the size of the swap area.  for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if ((nblocks = bdev_size(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.  take it easy on NFS servers.
		 */
		if (vp->v_tag == VT_NFS)
			sdp->swd_maxactive = 2; /* XXX */
		else
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.  we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%jx: size=%jd addr=%jd", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * allocate space for swap encryption state and mark the
	 * keys uninitialized so we generate them lazily
	 */
	sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP);
	sdp->swd_encinit = false;

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.  do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
		 */
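		/*
		 * Worked arithmetic (illustrative): (x >> 5) + (x >> 6) +
		 * (x >> 7) is 7/128 of x, so the pad below is about 5.5%
		 * on top of the f_blocks-based estimate.
		 */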
		rootblocks += (rootblocks >> 5) +
		    (rootblocks >> 6) +
		    (rootblocks >> 7);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
			panic("swap_on: unable to preserve miniroot");
		}

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
	if (error != 0)
		panic("swapdrum_add");
	/*
	 * If this is the first regular swap, create the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (vp->v_type != VBLK) {
		if (sw_reg_count++ == 0) {
			KASSERT(sw_reg_workqueue == NULL);
			if (workqueue_create(&sw_reg_workqueue, "swapiod",
			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
				panic("%s: workqueue_create failed", __func__);
		}
	}

	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	mutex_enter(&uvm_swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	uvmexp.swpgavail += size;
	mutex_exit(&uvm_swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	if (sdp->swd_blist) {
		blist_destroy(sdp->swd_blist);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
	}
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
	int npages = sdp->swd_npages;
	int error = 0;

	UVMHIST_FUNC(__func__);
	UVMHIST_CALLARGS(pdhist, "  dev=%jx, npages=%jd", sdp->swd_dev,npages, 0, 0);

	KASSERT(rw_write_held(&swap_syscall_lock));
	KASSERT(mutex_owned(&uvm_swap_data_lock));

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	uvmexp.swpgavail -= npages;
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize)) {
		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		mutex_enter(&uvm_swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		uvmexp.swpgavail += npages;
		mutex_exit(&uvm_swap_data_lock);

		return error;
	}

	/*
	 * If this is the last regular swap, destroy the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (sdp->swd_vp->v_type != VBLK) {
		KASSERT(sw_reg_count > 0);
		KASSERT(sw_reg_workqueue != NULL);
		if (--sw_reg_count == 0) {
			workqueue_destroy(sw_reg_workqueue);
			sw_reg_workqueue = NULL;
		}
	}

	/*
	 * done with the vnode.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
	}

	mutex_enter(&uvm_swap_data_lock);
	uvmexp.swpages -= npages;
	uvmexp.swpginuse -= sdp->swd_npgbad;

	if (swaplist_find(sdp->swd_vp, true) == NULL)
		panic("%s: swapdev not in list", __func__);
	swaplist_trim();
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * free all resources!
	 */
	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
	blist_destroy(sdp->swd_blist);
	bufq_free(sdp->swd_tab);
	kmem_free(__UNVOLATILE(sdp->swd_encmap),
	    encmap_size(sdp->swd_drumsize));
	explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey);
	explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey);
	kmem_free(sdp, sizeof(*sdp));
	return (0);
}

void
uvm_swap_shutdown(struct lwp *l)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct vnode *vp;
	int error;

	printf("turning off swap...");
	rw_enter(&swap_syscall_lock, RW_WRITER);
	mutex_enter(&uvm_swap_data_lock);
again:
	LIST_FOREACH(spp, &swap_priority, spi_swappri)
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
				continue;
#ifdef DEBUG
			printf("\nturning off swap on %s...",
			    sdp->swd_path);
#endif
			if (vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE)) {
				error = EBUSY;
				vp = NULL;
			} else
				error = 0;
			if (!error) {
				error = swap_off(l, sdp);
				mutex_enter(&uvm_swap_data_lock);
			}
			if (error) {
				printf("stopping swap on %s failed "
				    "with error %d\n", sdp->swd_path, error);
				TAILQ_REMOVE(&spp->spi_swapdev, sdp,
				    swd_next);
				uvmexp.nswapdev--;
				swaplist_trim();
				if (vp)
					vput(vp);
			}
			goto again;
		}
	printf(" done\n");
	mutex_exit(&uvm_swap_data_lock);
	rw_exit(&swap_syscall_lock);
}


/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
static void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	struct vnode *vp;
	int pageno, bn;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.  note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	mutex_exit(&uvm_swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	UVMHIST_LOG(pdhist, "  Rd/Wr (0/1) %jd: mapoff=%jx bn=%jx bcount=%jd",
	    ((bp->b_flags & B_READ) == 0) ? 1 : 0,
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	vp = sdp->swd_vp;		/* swapdev vnode pointer */
	switch (vp->v_type) {
	default:
		panic("%s: vnode type 0x%x", __func__, vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		bp->b_blkno = bn;		/* swapdev block number */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(bp->b_objlock);
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			mutex_exit(bp->b_objlock);
			mutex_enter(vp->v_interlock);
			vp->v_numoutput++;	/* put it on swapdev */
			mutex_exit(vp->v_interlock);
		}

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		bp->b_objlock = vp->v_interlock;
		VOP_STRATEGY(vp, bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}
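
/*
 * Worked example of the drum arithmetic above (illustrative, assumes
 * 4 KiB pages and 512-byte disk blocks): an I/O at b_blkno 96 is drum
 * page 96 >> 3 = 12; with swd_drumoffset 8 that is page 4 of the
 * swapdev, which maps back to device block 4 << 3 = 32.
 */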

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swread(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC(__func__);
	UVMHIST_CALLARGS(pdhist, "  dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0);

	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swwrite(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC(__func__);
	UVMHIST_CALLARGS(pdhist, "  dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0);

	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

const struct bdevsw swap_bdevsw = {
	.d_open = nullopen,
	.d_close = nullclose,
	.d_strategy = swstrategy,
	.d_ioctl = noioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_OTHER
};

const struct cdevsw swap_cdevsw = {
	.d_open = nullopen,
	.d_close = nullclose,
	.d_read = swread,
	.d_write = swwrite,
	.d_ioctl = noioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER,
};

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	char		*addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((uint64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("%s: swap to sparse file", __func__);
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
		    "vp %#jx/%#jx offset 0x%jx/0x%jx",
		    (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);

		/*
		 * now get a buf structure.  note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
		buf_init(&nbp->vb_buf);
		nbp->vb_buf.b_flags = bp->b_flags;
		nbp->vb_buf.b_cflags = bp->b_cflags;
		nbp->vb_buf.b_oflags = bp->b_oflags;
		nbp->vb_buf.b_bcount = sz;
		nbp->vb_buf.b_bufsize = sz;
		nbp->vb_buf.b_error = 0;
		nbp->vb_buf.b_data = addr;
		nbp->vb_buf.b_lblkno = 0;
		nbp->vb_buf.b_blkno = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_iodone = sw_reg_biodone;
		nbp->vb_buf.b_vp = vp;
		nbp->vb_buf.b_objlock = vp->v_interlock;
		if (vp->v_type == VBLK) {
			nbp->vb_buf.b_dev = vp->v_rdev;
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			buf_destroy(&nbp->vb_buf);
			pool_put(&vndbuf_pool, nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* sort it in and start I/O if we are not over our limit */
		/* XXXAD locking */
		bufq_put(sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		error = vnx->vx_error;
		pool_put(&vndxfer_pool, vnx);
		bp->b_error = error;
		biodone(bp);
	}
	splx(s);
}
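
/*
 * Illustrative fan-out (assumed sizes, not taken from this file): a
 * 64 KiB swap write to a file with a 16 KiB filesystem block size is
 * split by the loop above into up to four vndbuf's, each mapped with
 * VOP_BMAP, all hanging off one vndxfer whose vx_pending count ties
 * their completions back to the original /dev/drum buffer.
 */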

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by b_rawblkno (above)
 */
static void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*bp;
	struct vnode	*vp;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = bufq_get(sdp->swd_tab);
		if (bp == NULL)
			break;
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %#jx vp %#jx blkno %#jx cnt %jx",
		    (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
		    bp->b_bcount);
		vp = bp->b_vp;
		KASSERT(bp->b_objlock == vp->v_interlock);
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(vp->v_interlock);
			vp->v_numoutput++;
			mutex_exit(vp->v_interlock);
		}
		VOP_STRATEGY(vp, bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_biodone: one of our i/o's has completed
 */
static void
sw_reg_biodone(struct buf *bp)
{
	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(struct work *wk, void *dummy)
{
	struct vndbuf *vbp = (void *)wk;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int s, resid, error;
	KASSERT(&vbp->vb_buf.b_work == wk);
	UVMHIST_FUNC(__func__);
	UVMHIST_CALLARGS(pdhist, "  vbp=%#jx vp=%#jx blkno=%jx addr=%#jx",
	    (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
	    (uintptr_t)vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%jx resid=%jx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error != 0) {
		/* pass error upward */
		error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
		UVMHIST_LOG(pdhist, "  got error=%jd !", error, 0, 0, 0);
		vnx->vx_error = error;
	}

	/*
	 * kill vbp structure
	 */
	buf_destroy(&vbp->vb_buf);
	pool_put(&vndbuf_pool, vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			pbp->b_error = error;
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone, pbp=%#jx error=%jd !",
			    (uintptr_t)pbp, vnx->vx_error, 0, 0);
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	}

	/*
	 * done!  start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm_swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?  definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * XXXJAK: BEGIN HACK
	 *
	 * blist_alloc() in subr_blist.c will panic if we try to allocate
	 * too many slots.
	 */
	if (*nslots > BLIST_MAX_ALLOC) {
		if (__predict_false(lessok == false))
			return 0;
		*nslots = BLIST_MAX_ALLOC;
	}
	/* XXXJAK: END HACK */

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	mutex_enter(&uvm_swap_data_lock);

ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			uint64_t result;

			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			result = blist_alloc(sdp->swd_blist, *nslots);
			if (result == BLIST_NONE) {
				continue;
			}
			KASSERT(result < sdp->swd_drumsize);

			/*
			 * successful allocation!  now rotate the tailq.
			 */
			TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			mutex_exit(&uvm_swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %jd slots starting at %jd",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return (result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		/* XXXMRG: ugh!  blist should support this for us */
		goto ReTry;
	}
	/* XXXMRG: END HACK */

	mutex_exit(&uvm_swap_data_lock);
	return 0;
}

/*
 * uvm_swapisfull: return true if most of available swap is allocated
 * and in use.  we don't count some small portion as it may be inaccessible
 * to us at any given moment, for example if there is lock contention or if
 * pages are busy.
 */
bool
uvm_swapisfull(void)
{
	int swpgonly;
	bool rv;

	mutex_enter(&uvm_swap_data_lock);
	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
	swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
	    uvm_swapisfull_factor);
	rv = (swpgonly >= uvmexp.swpgavail);
	mutex_exit(&uvm_swap_data_lock);

	return (rv);
}
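
/*
 * Example of the threshold above (illustrative): with swpgavail = 1000
 * and uvm_swapisfull_factor = 99, swap is reported full once swpgonly
 * reaches 990 pages, since 990 * 100 / 99 = 1000 >= swpgavail.
 */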

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(sdp != NULL);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */

	KASSERT(uvmexp.swpgonly >= nslots);
	atomic_add_int(&uvmexp.swpgonly, -nslots);
	sdp->swd_npgbad += nslots;
	UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0);
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC(__func__);
	UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.  must hold pri lock to do
	 * lookup and access the extent.
	 */

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 */

int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int error;

	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	return error;
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 */

int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int error;

	atomic_inc_uint(&uvmexp.nswget);
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return EIO;
	}

	error = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	if (error == 0) {

		/*
		 * this page is no longer only in swap.
		 */

		KASSERT(uvmexp.swpgonly > 0);
		atomic_dec_uint(&uvmexp.swpgonly);
	}
	return error;
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct buf *bp;
	vaddr_t kva;
	int error, mapinflags;
	bool write, async, swap_encrypt;
	UVMHIST_FUNC(__func__);
	UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%jd",
	    startslot, npages, flags, 0);

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;
	swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt);

	/*
	 * allocate a buf for the i/o.
	 */

	KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
	bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
	if (bp == NULL) {
		uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
		return ENOMEM;
	}

	/*
	 * convert starting drum slot to block number
	 */

	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel.
	 */

	mapinflags = !write ?
	    UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
	    UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
	if (write && swap_encrypt)	/* need to encrypt in-place */
		mapinflags |= UVMPAGER_MAPIN_READ;
	kva = uvm_pagermapin(pps, npages, mapinflags);

	/*
	 * encrypt writes in place if requested
	 */

	if (write) do {
		struct swapdev *sdp;
		int i;

		/*
		 * Get the swapdev so we can discriminate on the
		 * encryption state.  There may or may not be an
		 * encryption key generated; we may or may not be asked
		 * to encrypt swap.
		 *
		 * 1. NO KEY, NO ENCRYPTION: Nothing to do.
		 *
		 * 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt,
		 *    and mark the slots encrypted.
		 *
		 * 3. KEY, BUT NO ENCRYPTION: The slots may already be
		 *    marked encrypted from a past life.  Mark them not
		 *    encrypted.
		 *
		 * 4. KEY, ENCRYPTION: Encrypt and mark the slots
		 *    encrypted.
		 */
		mutex_enter(&uvm_swap_data_lock);
		sdp = swapdrum_getsdp(startslot);
		if (!sdp->swd_encinit) {
			if (!swap_encrypt) {
				mutex_exit(&uvm_swap_data_lock);
				break;
			}
			uvm_swap_genkey(sdp);
		}
		KASSERT(sdp->swd_encinit);
		mutex_exit(&uvm_swap_data_lock);

		for (i = 0; i < npages; i++) {
			int s = startslot + i;
			KDASSERT(swapdrum_sdp_is(s, sdp));
			KASSERT(s >= sdp->swd_drumoffset);
			s -= sdp->swd_drumoffset;
			KASSERT(s < sdp->swd_drumsize);

			if (swap_encrypt) {
				uvm_swap_encryptpage(sdp,
				    (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
				atomic_or_32(&sdp->swd_encmap[s/32],
				    __BIT(s%32));
			} else {
				atomic_and_32(&sdp->swd_encmap[s/32],
				    ~__BIT(s%32));
			}
		}
	} while (0);

	/*
	 * fill in the bp/sbp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */

	bp->b_cflags = BC_BUSY | BC_NOCACHE;
	bp->b_flags = (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (void *)kva;
	bp->b_blkno = startblk;
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * bump v_numoutput (counter of number of active outputs).
	 */

	if (write) {
		mutex_enter(swapdev_vp->v_interlock);
		swapdev_vp->v_numoutput++;
		mutex_exit(swapdev_vp->v_interlock);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */

	if (async) {
		bp->b_iodone = uvm_aio_aiodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
		if (curlwp == uvm.pagedaemon_lwp)
			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
		else
			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
	} else {
		bp->b_iodone = NULL;
		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %#jx blkno = 0x%jx, bcount = %jd",
	    (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */

	VOP_STRATEGY(swapdev_vp, bp);
	if (async) {
		/*
		 * Reads are always synchronous; if this changes, we
		 * need to add an asynchronous path for decryption.
		 */
		KASSERT(write);
		return 0;
	}

	/*
	 * must be sync i/o.  wait for it to finish
	 */

	error = biowait(bp);
	if (error)
		goto out;

	/*
	 * decrypt reads in place if needed
	 */

	if (!write) do {
		struct swapdev *sdp;
		bool encinit;
		int i;

		/*
		 * Get the sdp.  Everything about it except the encinit
		 * bit, saying whether the encryption key is
		 * initialized or not, and the encrypted bit for each
		 * page, is stable until all swap pages have been
		 * released and the device is removed.
		 */
		mutex_enter(&uvm_swap_data_lock);
		sdp = swapdrum_getsdp(startslot);
		encinit = sdp->swd_encinit;
		mutex_exit(&uvm_swap_data_lock);

		if (!encinit)
			/*
			 * If there's no encryption key, there's no way
			 * any of these slots can be encrypted, so
			 * nothing to do here.
			 */
			break;
		for (i = 0; i < npages; i++) {
			int s = startslot + i;
			KDASSERT(swapdrum_sdp_is(s, sdp));
			KASSERT(s >= sdp->swd_drumoffset);
			s -= sdp->swd_drumoffset;
			KASSERT(s < sdp->swd_drumsize);
			if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) &
				__BIT(s%32)) == 0)
				continue;
			uvm_swap_decryptpage(sdp,
			    (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
		}
	} while (0);
out:
	/*
	 * kill the pager mapping
	 */

	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the buf and we're done.
	 */

	if (write) {
		mutex_enter(swapdev_vp->v_interlock);
		vwakeup(bp);
		mutex_exit(swapdev_vp->v_interlock);
	}
	putiobuf(bp);
	UVMHIST_LOG(pdhist, "<- done (sync)  error=%jd", error, 0, 0, 0);

	return (error);
}

/*
 * uvm_swap_genkey(sdp)
 *
 *	Generate a key for swap encryption.
 */
static void
uvm_swap_genkey(struct swapdev *sdp)
{
	uint8_t key[32];

	KASSERT(!sdp->swd_encinit);

	cprng_strong(kern_cprng, key, sizeof key, 0);
	aes_setenckey256(&sdp->swd_enckey, key);
	aes_setdeckey256(&sdp->swd_deckey, key);
	explicit_memset(key, 0, sizeof key);

	sdp->swd_encinit = true;
}

/*
 * uvm_swap_encryptpage(sdp, kva, slot)
 *
 *	Encrypt one page of data at kva for the specified slot number
 *	in the swap device.
 */
static void
uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot)
{
	uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);

	/* iv := AES_k(le32enc(slot) || 0^96) */
	le32enc(preiv, slot);
	aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);

	/* *kva := AES-CBC_k(iv, *kva) */
	aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv,
	    AES_256_NROUNDS);

	explicit_memset(&iv, 0, sizeof iv);
}

/*
 * uvm_swap_decryptpage(sdp, kva, slot)
 *
 *	Decrypt one page of data at kva for the specified slot number
 *	in the swap device.
 */
static void
uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot)
{
	uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);

	/* iv := AES_k(le32enc(slot) || 0^96) */
	le32enc(preiv, slot);
	aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);

	/* *kva := AES-CBC^{-1}_k(iv, *kva) */
	aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv,
	    AES_256_NROUNDS);

	explicit_memset(&iv, 0, sizeof iv);
}

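/*
 * The knob created below appears as vm.swap_encrypt; illustrative usage
 * (assumed, userland): "sysctl -w vm.swap_encrypt=1" makes subsequently
 * written swap pages pass through uvm_swap_encryptpage().
 */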
SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt",
	    SYSCTL_DESCR("Encrypt data when swapped out to disk"),
	    NULL, 0, &uvm_swap_encrypt, 0,
	    CTL_VM, CTL_CREATE, CTL_EOL);
}