1 /* $OpenBSD: uvm_swap.c,v 1.148 2020/12/14 13:29:18 mpi Exp $ */ 2 /* $NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $ */ 3 4 /* 5 * Copyright (c) 1995, 1996, 1997 Matthew R. Green 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 24 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp 30 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp 31 */ 32 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/buf.h> 36 #include <sys/conf.h> 37 #include <sys/proc.h> 38 #include <sys/namei.h> 39 #include <sys/disklabel.h> 40 #include <sys/errno.h> 41 #include <sys/kernel.h> 42 #include <sys/malloc.h> 43 #include <sys/vnode.h> 44 #include <sys/fcntl.h> 45 #include <sys/extent.h> 46 #include <sys/mount.h> 47 #include <sys/pool.h> 48 #include <sys/syscallargs.h> 49 #include <sys/swap.h> 50 #include <sys/disk.h> 51 #include <sys/task.h> 52 #include <sys/pledge.h> 53 #if defined(NFSCLIENT) 54 #include <sys/socket.h> 55 #include <sys/domain.h> 56 #include <netinet/in.h> 57 #include <nfs/nfsproto.h> 58 #include <nfs/nfsdiskless.h> 59 #endif 60 61 #include <uvm/uvm.h> 62 #ifdef UVM_SWAP_ENCRYPT 63 #include <uvm/uvm_swap_encrypt.h> 64 #endif 65 66 #include <sys/specdev.h> 67 68 #include "vnd.h" 69 70 /* 71 * uvm_swap.c: manage configuration and i/o to swap space. 72 */ 73 74 /* 75 * swap space is managed in the following way: 76 * 77 * each swap partition or file is described by a "swapdev" structure. 78 * each "swapdev" structure contains a "swapent" structure which contains 79 * information that is passed up to the user (via system calls). 80 * 81 * each swap partition is assigned a "priority" (int) which controls 82 * swap partition usage. 83 * 84 * the system maintains a global data structure describing all swap 85 * partitions/files. there is a sorted LIST of "swappri" structures 86 * which describe "swapdev"'s at that priority. this LIST is headed 87 * by the "swap_priority" global var. each "swappri" contains a 88 * TAILQ of "swapdev" structures at that priority. 89 * 90 * locking: 91 * - swap_syscall_lock (sleep lock): this lock serializes the swapctl 92 * system call and prevents the swap priority list from changing 93 * while we are in the middle of a system call (e.g. SWAP_STATS). 94 * 95 * each swap device has the following info: 96 * - swap device in use (could be disabled, preventing future use) 97 * - swap enabled (allows new allocations on swap) 98 * - map info in /dev/drum 99 * - vnode pointer 100 * for swap files only: 101 * - block size 102 * - max byte count in buffer 103 * - buffer 104 * - credentials to use when doing i/o to file 105 * 106 * userland controls and configures swap with the swapctl(2) system call. 107 * the sys_swapctl performs the following operations: 108 * [1] SWAP_NSWAP: returns the number of swap devices currently configured 109 * [2] SWAP_STATS: given a pointer to an array of swapent structures 110 * (passed in via "arg") of a size passed in via "misc" ... we load 111 * the current swap config into the array. 112 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a 113 * priority in "misc", start swapping on it. 114 * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device 115 * [5] SWAP_CTL: changes the priority of a swap device (new priority in 116 * "misc") 117 */ 118 119 /* 120 * swapdev: describes a single swap partition/file 121 * 122 * note the following should be true: 123 * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks] 124 * swd_nblks <= swd_mapsize [because mapsize includes disklabel] 125 */ 126 struct swapdev { 127 struct swapent swd_se; 128 #define swd_dev swd_se.se_dev /* device id */ 129 #define swd_flags swd_se.se_flags /* flags:inuse/enable/fake */ 130 #define swd_priority swd_se.se_priority /* our priority */ 131 #define swd_inuse swd_se.se_inuse /* blocks used */ 132 #define swd_nblks swd_se.se_nblks /* total blocks */ 133 char *swd_path; /* saved pathname of device */ 134 int swd_pathlen; /* length of pathname */ 135 int swd_npages; /* #pages we can use */ 136 int swd_npginuse; /* #pages in use */ 137 int swd_npgbad; /* #pages bad */ 138 int swd_drumoffset; /* page0 offset in drum */ 139 int swd_drumsize; /* #pages in drum */ 140 struct extent *swd_ex; /* extent for this swapdev */ 141 char swd_exname[12]; /* name of extent above */ 142 struct vnode *swd_vp; /* backing vnode */ 143 TAILQ_ENTRY(swapdev) swd_next; /* priority tailq */ 144 145 int swd_bsize; /* blocksize (bytes) */ 146 int swd_maxactive; /* max active i/o reqs */ 147 int swd_active; /* # of active i/o reqs */ 148 struct bufq swd_bufq; 149 struct ucred *swd_cred; /* cred for file access */ 150 #ifdef UVM_SWAP_ENCRYPT 151 #define SWD_KEY_SHIFT 7 /* One key per 0.5 MByte */ 152 #define SWD_KEY(x,y) &((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT]) 153 #define SWD_KEY_SIZE(x) (((x) + (1 << SWD_KEY_SHIFT) - 1) >> SWD_KEY_SHIFT) 154 155 #define SWD_DCRYPT_SHIFT 5 156 #define SWD_DCRYPT_BITS 32 157 #define SWD_DCRYPT_MASK (SWD_DCRYPT_BITS - 1) 158 #define SWD_DCRYPT_OFF(x) ((x) >> SWD_DCRYPT_SHIFT) 159 #define SWD_DCRYPT_BIT(x) ((x) & SWD_DCRYPT_MASK) 160 #define SWD_DCRYPT_SIZE(x) (SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK) * sizeof(u_int32_t)) 161 u_int32_t *swd_decrypt; /* bitmap for decryption */ 162 struct swap_key *swd_keys; /* keys for different parts */ 163 #endif 164 }; 165 166 /* 167 * swap device priority entry; the list is kept sorted on `spi_priority'. 168 */ 169 struct swappri { 170 int spi_priority; /* priority */ 171 TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev; 172 /* tailq of swapdevs at this priority */ 173 LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */ 174 }; 175 176 /* 177 * The following two structures are used to keep track of data transfers 178 * on swap devices associated with regular files. 179 * NOTE: this code is more or less a copy of vnd.c; we use the same 180 * structure names here to ease porting.. 181 */ 182 struct vndxfer { 183 struct buf *vx_bp; /* Pointer to parent buffer */ 184 struct swapdev *vx_sdp; 185 int vx_error; 186 int vx_pending; /* # of pending aux buffers */ 187 int vx_flags; 188 #define VX_BUSY 1 189 #define VX_DEAD 2 190 }; 191 192 struct vndbuf { 193 struct buf vb_buf; 194 struct vndxfer *vb_vnx; 195 struct task vb_task; 196 }; 197 198 /* 199 * We keep a of pool vndbuf's and vndxfer structures. 200 */ 201 struct pool vndxfer_pool; 202 struct pool vndbuf_pool; 203 204 205 /* 206 * local variables 207 */ 208 struct extent *swapmap; /* controls the mapping of /dev/drum */ 209 210 /* list of all active swap devices [by priority] */ 211 LIST_HEAD(swap_priority, swappri); 212 struct swap_priority swap_priority; 213 214 /* locks */ 215 struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk"); 216 217 /* 218 * prototypes 219 */ 220 void swapdrum_add(struct swapdev *, int); 221 struct swapdev *swapdrum_getsdp(int); 222 223 struct swapdev *swaplist_find(struct vnode *, int); 224 void swaplist_insert(struct swapdev *, 225 struct swappri *, int); 226 void swaplist_trim(void); 227 228 int swap_on(struct proc *, struct swapdev *); 229 int swap_off(struct proc *, struct swapdev *); 230 231 void sw_reg_strategy(struct swapdev *, struct buf *, int); 232 void sw_reg_iodone(struct buf *); 233 void sw_reg_iodone_internal(void *); 234 void sw_reg_start(struct swapdev *); 235 236 int uvm_swap_io(struct vm_page **, int, int, int); 237 238 void swapmount(void); 239 boolean_t uvm_swap_allocpages(struct vm_page **, int); 240 241 #ifdef UVM_SWAP_ENCRYPT 242 /* for swap encrypt */ 243 void uvm_swap_markdecrypt(struct swapdev *, int, int, int); 244 boolean_t uvm_swap_needdecrypt(struct swapdev *, int); 245 void uvm_swap_initcrypt(struct swapdev *, int); 246 #endif 247 248 /* 249 * uvm_swap_init: init the swap system data structures and locks 250 * 251 * => called at boot time from init_main.c after the filesystems 252 * are brought up (which happens after uvm_init()) 253 */ 254 void 255 uvm_swap_init(void) 256 { 257 /* 258 * first, init the swap list, its counter, and its lock. 259 * then get a handle on the vnode for /dev/drum by using 260 * the its dev_t number ("swapdev", from MD conf.c). 261 */ 262 LIST_INIT(&swap_priority); 263 uvmexp.nswapdev = 0; 264 265 if (!swapdev_vp && bdevvp(swapdev, &swapdev_vp)) 266 panic("uvm_swap_init: can't get vnode for swap device"); 267 268 /* 269 * create swap block extent to map /dev/drum. The extent spans 270 * 1 to INT_MAX allows 2 gigablocks of swap space. Note that 271 * block 0 is reserved (used to indicate an allocation failure, 272 * or no allocation). 273 */ 274 swapmap = extent_create("swapmap", 1, INT_MAX, 275 M_VMSWAP, 0, 0, EX_NOWAIT); 276 if (swapmap == 0) 277 panic("uvm_swap_init: extent_create failed"); 278 279 /* allocate pools for structures used for swapping to files. */ 280 pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, IPL_BIO, 0, 281 "swp vnx", NULL); 282 pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, IPL_BIO, 0, 283 "swp vnd", NULL); 284 285 /* Setup the initial swap partition */ 286 swapmount(); 287 } 288 289 #ifdef UVM_SWAP_ENCRYPT 290 void 291 uvm_swap_initcrypt_all(void) 292 { 293 struct swapdev *sdp; 294 struct swappri *spp; 295 int npages; 296 297 298 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 299 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 300 if (sdp->swd_decrypt == NULL) { 301 npages = dbtob((uint64_t)sdp->swd_nblks) >> 302 PAGE_SHIFT; 303 uvm_swap_initcrypt(sdp, npages); 304 } 305 } 306 } 307 } 308 309 void 310 uvm_swap_initcrypt(struct swapdev *sdp, int npages) 311 { 312 /* 313 * keep information if a page needs to be decrypted when we get it 314 * from the swap device. 315 * We cannot chance a malloc later, if we are doing ASYNC puts, 316 * we may not call malloc with M_WAITOK. This consumes only 317 * 8KB memory for a 256MB swap partition. 318 */ 319 sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages), M_VMSWAP, 320 M_WAITOK|M_ZERO); 321 sdp->swd_keys = mallocarray(SWD_KEY_SIZE(npages), 322 sizeof(struct swap_key), M_VMSWAP, M_WAITOK|M_ZERO); 323 } 324 325 #endif /* UVM_SWAP_ENCRYPT */ 326 327 boolean_t 328 uvm_swap_allocpages(struct vm_page **pps, int npages) 329 { 330 struct pglist pgl; 331 int i; 332 boolean_t fail; 333 334 /* Estimate if we will succeed */ 335 uvm_lock_fpageq(); 336 337 fail = uvmexp.free - npages < uvmexp.reserve_kernel; 338 339 uvm_unlock_fpageq(); 340 341 if (fail) 342 return FALSE; 343 344 TAILQ_INIT(&pgl); 345 if (uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low, 346 dma_constraint.ucr_high, 0, 0, &pgl, npages, UVM_PLA_NOWAIT)) 347 return FALSE; 348 349 for (i = 0; i < npages; i++) { 350 pps[i] = TAILQ_FIRST(&pgl); 351 /* *sigh* */ 352 atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY); 353 TAILQ_REMOVE(&pgl, pps[i], pageq); 354 } 355 356 return TRUE; 357 } 358 359 void 360 uvm_swap_freepages(struct vm_page **pps, int npages) 361 { 362 int i; 363 364 uvm_lock_pageq(); 365 for (i = 0; i < npages; i++) 366 uvm_pagefree(pps[i]); 367 uvm_unlock_pageq(); 368 } 369 370 #ifdef UVM_SWAP_ENCRYPT 371 /* 372 * Mark pages on the swap device for later decryption 373 */ 374 375 void 376 uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages, 377 int decrypt) 378 { 379 int pagestart, i; 380 int off, bit; 381 382 if (!sdp) 383 return; 384 385 pagestart = startslot - sdp->swd_drumoffset; 386 for (i = 0; i < npages; i++, pagestart++) { 387 off = SWD_DCRYPT_OFF(pagestart); 388 bit = SWD_DCRYPT_BIT(pagestart); 389 if (decrypt) 390 /* pages read need decryption */ 391 sdp->swd_decrypt[off] |= 1 << bit; 392 else 393 /* pages read do not need decryption */ 394 sdp->swd_decrypt[off] &= ~(1 << bit); 395 } 396 } 397 398 /* 399 * Check if the page that we got from disk needs to be decrypted 400 */ 401 402 boolean_t 403 uvm_swap_needdecrypt(struct swapdev *sdp, int off) 404 { 405 if (!sdp) 406 return FALSE; 407 408 off -= sdp->swd_drumoffset; 409 return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)] & (1 << SWD_DCRYPT_BIT(off)) ? 410 TRUE : FALSE; 411 } 412 413 void 414 uvm_swap_finicrypt_all(void) 415 { 416 struct swapdev *sdp; 417 struct swappri *spp; 418 struct swap_key *key; 419 unsigned int nkeys; 420 421 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 422 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 423 if (sdp->swd_decrypt == NULL) 424 continue; 425 426 nkeys = dbtob((uint64_t)sdp->swd_nblks) >> PAGE_SHIFT; 427 key = sdp->swd_keys + (SWD_KEY_SIZE(nkeys) - 1); 428 do { 429 if (key->refcount != 0) 430 swap_key_delete(key); 431 } while (key-- != sdp->swd_keys); 432 } 433 } 434 } 435 #endif /* UVM_SWAP_ENCRYPT */ 436 437 /* 438 * swaplist functions: functions that operate on the list of swap 439 * devices on the system. 440 */ 441 442 /* 443 * swaplist_insert: insert swap device "sdp" into the global list 444 * 445 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock 446 * => caller must provide a newly malloc'd swappri structure (we will 447 * FREE it if we don't need it... this it to prevent malloc blocking 448 * here while adding swap) 449 */ 450 void 451 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority) 452 { 453 struct swappri *spp, *pspp; 454 455 /* 456 * find entry at or after which to insert the new device. 457 */ 458 for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL; 459 spp = LIST_NEXT(spp, spi_swappri)) { 460 if (priority <= spp->spi_priority) 461 break; 462 pspp = spp; 463 } 464 465 /* 466 * new priority? 467 */ 468 if (spp == NULL || spp->spi_priority != priority) { 469 spp = newspp; /* use newspp! */ 470 471 spp->spi_priority = priority; 472 TAILQ_INIT(&spp->spi_swapdev); 473 474 if (pspp) 475 LIST_INSERT_AFTER(pspp, spp, spi_swappri); 476 else 477 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri); 478 } else { 479 /* we don't need a new priority structure, free it */ 480 free(newspp, M_VMSWAP, sizeof(*newspp)); 481 } 482 483 /* 484 * priority found (or created). now insert on the priority's 485 * tailq list and bump the total number of swapdevs. 486 */ 487 sdp->swd_priority = priority; 488 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 489 uvmexp.nswapdev++; 490 } 491 492 /* 493 * swaplist_find: find and optionally remove a swap device from the 494 * global list. 495 * 496 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock 497 * => we return the swapdev we found (and removed) 498 */ 499 struct swapdev * 500 swaplist_find(struct vnode *vp, boolean_t remove) 501 { 502 struct swapdev *sdp; 503 struct swappri *spp; 504 505 /* 506 * search the lists for the requested vp 507 */ 508 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 509 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 510 if (sdp->swd_vp != vp) 511 continue; 512 if (remove) { 513 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 514 uvmexp.nswapdev--; 515 } 516 return (sdp); 517 } 518 } 519 return (NULL); 520 } 521 522 523 /* 524 * swaplist_trim: scan priority list for empty priority entries and kill 525 * them. 526 * 527 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock 528 */ 529 void 530 swaplist_trim(void) 531 { 532 struct swappri *spp, *nextspp; 533 534 LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) { 535 if (!TAILQ_EMPTY(&spp->spi_swapdev)) 536 continue; 537 LIST_REMOVE(spp, spi_swappri); 538 free(spp, M_VMSWAP, sizeof(*spp)); 539 } 540 } 541 542 /* 543 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area. 544 * 545 * => caller must hold swap_syscall_lock 546 * => uvm.swap_data_lock should be unlocked (we may sleep) 547 */ 548 void 549 swapdrum_add(struct swapdev *sdp, int npages) 550 { 551 u_long result; 552 553 if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY, 554 EX_WAITOK, &result)) 555 panic("swapdrum_add"); 556 557 sdp->swd_drumoffset = result; 558 sdp->swd_drumsize = npages; 559 } 560 561 /* 562 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back 563 * to the "swapdev" that maps that section of the drum. 564 * 565 * => each swapdev takes one big contig chunk of the drum 566 * => caller must hold uvm.swap_data_lock 567 */ 568 struct swapdev * 569 swapdrum_getsdp(int pgno) 570 { 571 struct swapdev *sdp; 572 struct swappri *spp; 573 574 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 575 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 576 if (pgno >= sdp->swd_drumoffset && 577 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { 578 return sdp; 579 } 580 } 581 } 582 return NULL; 583 } 584 585 586 /* 587 * sys_swapctl: main entry point for swapctl(2) system call 588 * [with two helper functions: swap_on and swap_off] 589 */ 590 int 591 sys_swapctl(struct proc *p, void *v, register_t *retval) 592 { 593 struct sys_swapctl_args /* { 594 syscallarg(int) cmd; 595 syscallarg(void *) arg; 596 syscallarg(int) misc; 597 } */ *uap = (struct sys_swapctl_args *)v; 598 struct vnode *vp; 599 struct nameidata nd; 600 struct swappri *spp; 601 struct swapdev *sdp; 602 struct swapent *sep; 603 char userpath[MAXPATHLEN]; 604 size_t len; 605 int count, error, misc; 606 int priority; 607 608 misc = SCARG(uap, misc); 609 610 /* 611 * ensure serialized syscall access by grabbing the swap_syscall_lock 612 */ 613 rw_enter_write(&swap_syscall_lock); 614 615 /* 616 * we handle the non-priv NSWAP and STATS request first. 617 * 618 * SWAP_NSWAP: return number of config'd swap devices 619 * [can also be obtained with uvmexp sysctl] 620 */ 621 if (SCARG(uap, cmd) == SWAP_NSWAP) { 622 *retval = uvmexp.nswapdev; 623 error = 0; 624 goto out; 625 } 626 627 /* 628 * SWAP_STATS: get stats on current # of configured swap devs 629 * 630 * note that the swap_priority list can't change as long 631 * as we are holding the swap_syscall_lock. we don't want 632 * to grab the uvm.swap_data_lock because we may fault&sleep during 633 * copyout() and we don't want to be holding that lock then! 634 */ 635 if (SCARG(uap, cmd) == SWAP_STATS) { 636 sep = (struct swapent *)SCARG(uap, arg); 637 count = 0; 638 639 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 640 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 641 if (count >= misc) 642 continue; 643 644 sdp->swd_inuse = 645 btodb((u_int64_t)sdp->swd_npginuse << 646 PAGE_SHIFT); 647 error = copyout(&sdp->swd_se, sep, 648 sizeof(struct swapent)); 649 if (error) 650 goto out; 651 652 /* now copy out the path if necessary */ 653 error = copyoutstr(sdp->swd_path, 654 sep->se_path, sizeof(sep->se_path), NULL); 655 if (error) 656 goto out; 657 658 count++; 659 sep++; 660 } 661 } 662 663 *retval = count; 664 error = 0; 665 goto out; 666 } 667 668 /* all other requests require superuser privs. verify. */ 669 if ((error = suser(p)) || (error = pledge_swapctl(p))) 670 goto out; 671 672 /* 673 * at this point we expect a path name in arg. we will 674 * use namei() to gain a vnode reference (vref), and lock 675 * the vnode (VOP_LOCK). 676 */ 677 error = copyinstr(SCARG(uap, arg), userpath, sizeof(userpath), &len); 678 if (error) 679 goto out; 680 disk_map(userpath, userpath, sizeof(userpath), DM_OPENBLCK); 681 NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, userpath, p); 682 if ((error = namei(&nd))) 683 goto out; 684 vp = nd.ni_vp; 685 /* note: "vp" is referenced and locked */ 686 687 error = 0; /* assume no error */ 688 switch(SCARG(uap, cmd)) { 689 case SWAP_DUMPDEV: 690 if (vp->v_type != VBLK) { 691 error = ENOTBLK; 692 break; 693 } 694 dumpdev = vp->v_rdev; 695 break; 696 case SWAP_CTL: 697 /* 698 * get new priority, remove old entry (if any) and then 699 * reinsert it in the correct place. finally, prune out 700 * any empty priority structures. 701 */ 702 priority = SCARG(uap, misc); 703 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK); 704 if ((sdp = swaplist_find(vp, 1)) == NULL) { 705 error = ENOENT; 706 } else { 707 swaplist_insert(sdp, spp, priority); 708 swaplist_trim(); 709 } 710 if (error) 711 free(spp, M_VMSWAP, sizeof(*spp)); 712 break; 713 case SWAP_ON: 714 /* 715 * If the device is a regular file, make sure the filesystem 716 * can be used for swapping. 717 */ 718 if (vp->v_type == VREG && 719 (vp->v_mount->mnt_flag & MNT_SWAPPABLE) == 0) { 720 error = ENOTSUP; 721 break; 722 } 723 724 /* 725 * check for duplicates. if none found, then insert a 726 * dummy entry on the list to prevent someone else from 727 * trying to enable this device while we are working on 728 * it. 729 */ 730 priority = SCARG(uap, misc); 731 if ((sdp = swaplist_find(vp, 0)) != NULL) { 732 error = EBUSY; 733 break; 734 } 735 sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK|M_ZERO); 736 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK); 737 sdp->swd_flags = SWF_FAKE; /* placeholder only */ 738 sdp->swd_vp = vp; 739 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV; 740 741 /* 742 * XXX Is NFS elaboration necessary? 743 */ 744 if (vp->v_type == VREG) { 745 sdp->swd_cred = crdup(p->p_ucred); 746 } 747 748 swaplist_insert(sdp, spp, priority); 749 750 sdp->swd_pathlen = len; 751 sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK); 752 strlcpy(sdp->swd_path, userpath, len); 753 754 /* 755 * we've now got a FAKE placeholder in the swap list. 756 * now attempt to enable swap on it. if we fail, undo 757 * what we've done and kill the fake entry we just inserted. 758 * if swap_on is a success, it will clear the SWF_FAKE flag 759 */ 760 761 if ((error = swap_on(p, sdp)) != 0) { 762 (void) swaplist_find(vp, 1); /* kill fake entry */ 763 swaplist_trim(); 764 if (vp->v_type == VREG) { 765 crfree(sdp->swd_cred); 766 } 767 free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen); 768 free(sdp, M_VMSWAP, sizeof(*sdp)); 769 break; 770 } 771 break; 772 case SWAP_OFF: 773 if ((sdp = swaplist_find(vp, 0)) == NULL) { 774 error = ENXIO; 775 break; 776 } 777 778 /* 779 * If a device isn't in use or enabled, we 780 * can't stop swapping from it (again). 781 */ 782 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { 783 error = EBUSY; 784 break; 785 } 786 787 /* 788 * do the real work. 789 */ 790 error = swap_off(p, sdp); 791 break; 792 default: 793 error = EINVAL; 794 } 795 796 /* done! release the ref gained by namei() and unlock. */ 797 vput(vp); 798 799 out: 800 rw_exit_write(&swap_syscall_lock); 801 802 return (error); 803 } 804 805 /* 806 * swap_on: attempt to enable a swapdev for swapping. note that the 807 * swapdev is already on the global list, but disabled (marked 808 * SWF_FAKE). 809 * 810 * => we avoid the start of the disk (to protect disk labels) 811 * => caller should leave uvm.swap_data_lock unlocked, we may lock it 812 * if needed. 813 */ 814 int 815 swap_on(struct proc *p, struct swapdev *sdp) 816 { 817 static int count = 0; /* static */ 818 struct vnode *vp; 819 int error, npages, nblocks, size; 820 long addr; 821 struct vattr va; 822 #if defined(NFSCLIENT) 823 extern const struct vops nfs_vops; 824 #endif /* defined(NFSCLIENT) */ 825 dev_t dev; 826 827 /* 828 * we want to enable swapping on sdp. the swd_vp contains 829 * the vnode we want (locked and ref'd), and the swd_dev 830 * contains the dev_t of the file, if it a block device. 831 */ 832 833 vp = sdp->swd_vp; 834 dev = sdp->swd_dev; 835 836 #if NVND > 0 837 /* no swapping to vnds. */ 838 if (bdevsw[major(dev)].d_strategy == vndstrategy) 839 return (EOPNOTSUPP); 840 #endif 841 842 /* 843 * open the swap file (mostly useful for block device files to 844 * let device driver know what is up). 845 * 846 * we skip the open/close for root on swap because the root 847 * has already been opened when root was mounted (mountroot). 848 */ 849 if (vp != rootvp) { 850 if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p))) 851 return (error); 852 } 853 854 /* XXX this only works for block devices */ 855 /* 856 * we now need to determine the size of the swap area. for 857 * block specials we can call the d_psize function. 858 * for normal files, we must stat [get attrs]. 859 * 860 * we put the result in nblks. 861 * for normal files, we also want the filesystem block size 862 * (which we get with statfs). 863 */ 864 switch (vp->v_type) { 865 case VBLK: 866 if (bdevsw[major(dev)].d_psize == 0 || 867 (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) { 868 error = ENXIO; 869 goto bad; 870 } 871 break; 872 873 case VREG: 874 if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p))) 875 goto bad; 876 nblocks = (int)btodb(va.va_size); 877 if ((error = 878 VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0) 879 goto bad; 880 881 sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize; 882 /* 883 * limit the max # of outstanding I/O requests we issue 884 * at any one time. take it easy on NFS servers. 885 */ 886 #if defined(NFSCLIENT) 887 if (vp->v_op == &nfs_vops) 888 sdp->swd_maxactive = 2; /* XXX */ 889 else 890 #endif /* defined(NFSCLIENT) */ 891 sdp->swd_maxactive = 8; /* XXX */ 892 bufq_init(&sdp->swd_bufq, BUFQ_FIFO); 893 break; 894 895 default: 896 error = ENXIO; 897 goto bad; 898 } 899 900 /* 901 * save nblocks in a safe place and convert to pages. 902 */ 903 904 sdp->swd_nblks = nblocks; 905 npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT; 906 907 /* 908 * for block special files, we want to make sure that leave 909 * the disklabel and bootblocks alone, so we arrange to skip 910 * over them (arbitrarily choosing to skip PAGE_SIZE bytes). 911 * note that because of this the "size" can be less than the 912 * actual number of blocks on the device. 913 */ 914 if (vp->v_type == VBLK) { 915 /* we use pages 1 to (size - 1) [inclusive] */ 916 size = npages - 1; 917 addr = 1; 918 } else { 919 /* we use pages 0 to (size - 1) [inclusive] */ 920 size = npages; 921 addr = 0; 922 } 923 924 /* 925 * make sure we have enough blocks for a reasonable sized swap 926 * area. we want at least one page. 927 */ 928 929 if (size < 1) { 930 error = EINVAL; 931 goto bad; 932 } 933 934 /* 935 * now we need to allocate an extent to manage this swap device 936 */ 937 snprintf(sdp->swd_exname, sizeof(sdp->swd_exname), "swap0x%04x", 938 count++); 939 940 /* note that extent_create's 3rd arg is inclusive, thus "- 1" */ 941 sdp->swd_ex = extent_create(sdp->swd_exname, 0, npages - 1, M_VMSWAP, 942 0, 0, EX_WAITOK); 943 /* allocate the `saved' region from the extent so it won't be used */ 944 if (addr) { 945 if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK)) 946 panic("disklabel reserve"); 947 /* XXX: is extent synchronized with swd_npginuse? */ 948 } 949 #ifdef HIBERNATE 950 /* 951 * Lock down the last region of primary disk swap, in case 952 * hibernate needs to place a signature there. 953 */ 954 if (dev == swdevt[0].sw_dev && vp->v_type == VBLK && size > 3 ) { 955 if (extent_alloc_region(sdp->swd_ex, 956 npages - 1 - 1, 1, EX_WAITOK)) 957 panic("hibernate reserve"); 958 /* XXX: is extent synchronized with swd_npginuse? */ 959 } 960 #endif 961 962 /* add a ref to vp to reflect usage as a swap device. */ 963 vref(vp); 964 965 #ifdef UVM_SWAP_ENCRYPT 966 if (uvm_doswapencrypt) 967 uvm_swap_initcrypt(sdp, npages); 968 #endif 969 /* now add the new swapdev to the drum and enable. */ 970 swapdrum_add(sdp, npages); 971 sdp->swd_npages = size; 972 sdp->swd_flags &= ~SWF_FAKE; /* going live */ 973 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); 974 uvmexp.swpages += size; 975 return (0); 976 977 bad: 978 /* failure: close device if necessary and return error. */ 979 if (vp != rootvp) 980 (void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p); 981 return (error); 982 } 983 984 /* 985 * swap_off: stop swapping on swapdev 986 * 987 * => swap data should be locked, we will unlock. 988 */ 989 int 990 swap_off(struct proc *p, struct swapdev *sdp) 991 { 992 int error = 0; 993 994 /* disable the swap area being removed */ 995 sdp->swd_flags &= ~SWF_ENABLE; 996 997 /* 998 * the idea is to find all the pages that are paged out to this 999 * device, and page them all in. in uvm, swap-backed pageable 1000 * memory can take two forms: aobjs and anons. call the 1001 * swapoff hook for each subsystem to bring in pages. 1002 */ 1003 1004 if (uao_swap_off(sdp->swd_drumoffset, 1005 sdp->swd_drumoffset + sdp->swd_drumsize) || 1006 amap_swap_off(sdp->swd_drumoffset, 1007 sdp->swd_drumoffset + sdp->swd_drumsize)) { 1008 1009 error = ENOMEM; 1010 } else if (sdp->swd_npginuse > sdp->swd_npgbad) { 1011 error = EBUSY; 1012 } 1013 1014 if (error) { 1015 sdp->swd_flags |= SWF_ENABLE; 1016 return (error); 1017 } 1018 1019 /* 1020 * done with the vnode and saved creds. 1021 * drop our ref on the vnode before calling VOP_CLOSE() 1022 * so that spec_close() can tell if this is the last close. 1023 */ 1024 if (sdp->swd_vp->v_type == VREG) { 1025 crfree(sdp->swd_cred); 1026 } 1027 vrele(sdp->swd_vp); 1028 if (sdp->swd_vp != rootvp) { 1029 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p); 1030 } 1031 1032 uvmexp.swpages -= sdp->swd_npages; 1033 1034 if (swaplist_find(sdp->swd_vp, 1) == NULL) 1035 panic("swap_off: swapdev not in list"); 1036 swaplist_trim(); 1037 1038 /* 1039 * free all resources! 1040 */ 1041 extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize, 1042 EX_WAITOK); 1043 extent_destroy(sdp->swd_ex); 1044 /* free sdp->swd_path ? */ 1045 free(sdp, M_VMSWAP, sizeof(*sdp)); 1046 return (0); 1047 } 1048 1049 /* 1050 * /dev/drum interface and i/o functions 1051 */ 1052 1053 /* 1054 * swstrategy: perform I/O on the drum 1055 * 1056 * => we must map the i/o request from the drum to the correct swapdev. 1057 */ 1058 void 1059 swstrategy(struct buf *bp) 1060 { 1061 struct swapdev *sdp; 1062 int s, pageno, bn; 1063 1064 /* 1065 * convert block number to swapdev. note that swapdev can't 1066 * be yanked out from under us because we are holding resources 1067 * in it (i.e. the blocks we are doing I/O on). 1068 */ 1069 pageno = dbtob((u_int64_t)bp->b_blkno) >> PAGE_SHIFT; 1070 sdp = swapdrum_getsdp(pageno); 1071 if (sdp == NULL) { 1072 bp->b_error = EINVAL; 1073 bp->b_flags |= B_ERROR; 1074 s = splbio(); 1075 biodone(bp); 1076 splx(s); 1077 return; 1078 } 1079 1080 /* convert drum page number to block number on this swapdev. */ 1081 pageno -= sdp->swd_drumoffset; /* page # on swapdev */ 1082 bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ 1083 1084 /* 1085 * for block devices we finish up here. 1086 * for regular files we have to do more work which we delegate 1087 * to sw_reg_strategy(). 1088 */ 1089 switch (sdp->swd_vp->v_type) { 1090 default: 1091 panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type); 1092 case VBLK: 1093 /* 1094 * must convert "bp" from an I/O on /dev/drum to an I/O 1095 * on the swapdev (sdp). 1096 */ 1097 s = splbio(); 1098 buf_replacevnode(bp, sdp->swd_vp); 1099 1100 bp->b_blkno = bn; 1101 splx(s); 1102 VOP_STRATEGY(bp); 1103 return; 1104 case VREG: 1105 /* delegate to sw_reg_strategy function. */ 1106 sw_reg_strategy(sdp, bp, bn); 1107 return; 1108 } 1109 /* NOTREACHED */ 1110 } 1111 1112 /* 1113 * sw_reg_strategy: handle swap i/o to regular files 1114 */ 1115 void 1116 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) 1117 { 1118 struct vnode *vp; 1119 struct vndxfer *vnx; 1120 daddr_t nbn; 1121 caddr_t addr; 1122 off_t byteoff; 1123 int s, off, nra, error, sz, resid; 1124 1125 /* 1126 * allocate a vndxfer head for this transfer and point it to 1127 * our buffer. 1128 */ 1129 vnx = pool_get(&vndxfer_pool, PR_WAITOK); 1130 vnx->vx_flags = VX_BUSY; 1131 vnx->vx_error = 0; 1132 vnx->vx_pending = 0; 1133 vnx->vx_bp = bp; 1134 vnx->vx_sdp = sdp; 1135 1136 /* 1137 * setup for main loop where we read filesystem blocks into 1138 * our buffer. 1139 */ 1140 error = 0; 1141 bp->b_resid = bp->b_bcount; /* nothing transferred yet! */ 1142 addr = bp->b_data; /* current position in buffer */ 1143 byteoff = dbtob((u_int64_t)bn); 1144 1145 for (resid = bp->b_resid; resid; resid -= sz) { 1146 struct vndbuf *nbp; 1147 /* 1148 * translate byteoffset into block number. return values: 1149 * vp = vnode of underlying device 1150 * nbn = new block number (on underlying vnode dev) 1151 * nra = num blocks we can read-ahead (excludes requested 1152 * block) 1153 */ 1154 nra = 0; 1155 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, 1156 &vp, &nbn, &nra); 1157 1158 if (error == 0 && nbn == -1) { 1159 /* 1160 * this used to just set error, but that doesn't 1161 * do the right thing. Instead, it causes random 1162 * memory errors. The panic() should remain until 1163 * this condition doesn't destabilize the system. 1164 */ 1165 #if 1 1166 panic("sw_reg_strategy: swap to sparse file"); 1167 #else 1168 error = EIO; /* failure */ 1169 #endif 1170 } 1171 1172 /* 1173 * punt if there was an error or a hole in the file. 1174 * we must wait for any i/o ops we have already started 1175 * to finish before returning. 1176 * 1177 * XXX we could deal with holes here but it would be 1178 * a hassle (in the write case). 1179 */ 1180 if (error) { 1181 s = splbio(); 1182 vnx->vx_error = error; /* pass error up */ 1183 goto out; 1184 } 1185 1186 /* 1187 * compute the size ("sz") of this transfer (in bytes). 1188 */ 1189 off = byteoff % sdp->swd_bsize; 1190 sz = (1 + nra) * sdp->swd_bsize - off; 1191 if (sz > resid) 1192 sz = resid; 1193 1194 /* 1195 * now get a buf structure. note that the vb_buf is 1196 * at the front of the nbp structure so that you can 1197 * cast pointers between the two structure easily. 1198 */ 1199 nbp = pool_get(&vndbuf_pool, PR_WAITOK); 1200 nbp->vb_buf.b_flags = bp->b_flags | B_CALL; 1201 nbp->vb_buf.b_bcount = sz; 1202 nbp->vb_buf.b_bufsize = sz; 1203 nbp->vb_buf.b_error = 0; 1204 nbp->vb_buf.b_data = addr; 1205 nbp->vb_buf.b_bq = NULL; 1206 nbp->vb_buf.b_blkno = nbn + btodb(off); 1207 nbp->vb_buf.b_proc = bp->b_proc; 1208 nbp->vb_buf.b_iodone = sw_reg_iodone; 1209 nbp->vb_buf.b_vp = NULLVP; 1210 nbp->vb_buf.b_vnbufs.le_next = NOLIST; 1211 LIST_INIT(&nbp->vb_buf.b_dep); 1212 1213 /* 1214 * set b_dirtyoff/end and b_validoff/end. this is 1215 * required by the NFS client code (otherwise it will 1216 * just discard our I/O request). 1217 */ 1218 if (bp->b_dirtyend == 0) { 1219 nbp->vb_buf.b_dirtyoff = 0; 1220 nbp->vb_buf.b_dirtyend = sz; 1221 } else { 1222 nbp->vb_buf.b_dirtyoff = 1223 max(0, bp->b_dirtyoff - (bp->b_bcount-resid)); 1224 nbp->vb_buf.b_dirtyend = 1225 min(sz, 1226 max(0, bp->b_dirtyend - (bp->b_bcount-resid))); 1227 } 1228 if (bp->b_validend == 0) { 1229 nbp->vb_buf.b_validoff = 0; 1230 nbp->vb_buf.b_validend = sz; 1231 } else { 1232 nbp->vb_buf.b_validoff = 1233 max(0, bp->b_validoff - (bp->b_bcount-resid)); 1234 nbp->vb_buf.b_validend = 1235 min(sz, 1236 max(0, bp->b_validend - (bp->b_bcount-resid))); 1237 } 1238 1239 /* patch it back to the vnx */ 1240 nbp->vb_vnx = vnx; 1241 task_set(&nbp->vb_task, sw_reg_iodone_internal, nbp); 1242 1243 s = splbio(); 1244 if (vnx->vx_error != 0) { 1245 pool_put(&vndbuf_pool, nbp); 1246 goto out; 1247 } 1248 vnx->vx_pending++; 1249 1250 /* assoc new buffer with underlying vnode */ 1251 bgetvp(vp, &nbp->vb_buf); 1252 1253 /* start I/O if we are not over our limit */ 1254 bufq_queue(&sdp->swd_bufq, &nbp->vb_buf); 1255 sw_reg_start(sdp); 1256 splx(s); 1257 1258 /* 1259 * advance to the next I/O 1260 */ 1261 byteoff += sz; 1262 addr += sz; 1263 } 1264 1265 s = splbio(); 1266 1267 out: /* Arrive here at splbio */ 1268 vnx->vx_flags &= ~VX_BUSY; 1269 if (vnx->vx_pending == 0) { 1270 if (vnx->vx_error != 0) { 1271 bp->b_error = vnx->vx_error; 1272 bp->b_flags |= B_ERROR; 1273 } 1274 pool_put(&vndxfer_pool, vnx); 1275 biodone(bp); 1276 } 1277 splx(s); 1278 } 1279 1280 /* sw_reg_start: start an I/O request on the requested swapdev. */ 1281 void 1282 sw_reg_start(struct swapdev *sdp) 1283 { 1284 struct buf *bp; 1285 1286 /* XXX: recursion control */ 1287 if ((sdp->swd_flags & SWF_BUSY) != 0) 1288 return; 1289 1290 sdp->swd_flags |= SWF_BUSY; 1291 1292 while (sdp->swd_active < sdp->swd_maxactive) { 1293 bp = bufq_dequeue(&sdp->swd_bufq); 1294 if (bp == NULL) 1295 break; 1296 1297 sdp->swd_active++; 1298 1299 if ((bp->b_flags & B_READ) == 0) 1300 bp->b_vp->v_numoutput++; 1301 1302 VOP_STRATEGY(bp); 1303 } 1304 sdp->swd_flags &= ~SWF_BUSY; 1305 } 1306 1307 /* 1308 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup 1309 * 1310 * => note that we can recover the vndbuf struct by casting the buf ptr 1311 * 1312 * XXX: 1313 * We only put this onto a taskq here, because of the maxactive game since 1314 * it basically requires us to call back into VOP_STRATEGY() (where we must 1315 * be able to sleep) via sw_reg_start(). 1316 */ 1317 void 1318 sw_reg_iodone(struct buf *bp) 1319 { 1320 struct vndbuf *vbp = (struct vndbuf *)bp; 1321 task_add(systq, &vbp->vb_task); 1322 } 1323 1324 void 1325 sw_reg_iodone_internal(void *xvbp) 1326 { 1327 struct vndbuf *vbp = xvbp; 1328 struct vndxfer *vnx = vbp->vb_vnx; 1329 struct buf *pbp = vnx->vx_bp; /* parent buffer */ 1330 struct swapdev *sdp = vnx->vx_sdp; 1331 int resid, s; 1332 1333 s = splbio(); 1334 1335 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; 1336 pbp->b_resid -= resid; 1337 vnx->vx_pending--; 1338 1339 /* pass error upward */ 1340 if (vbp->vb_buf.b_error) 1341 vnx->vx_error = vbp->vb_buf.b_error; 1342 1343 /* disassociate this buffer from the vnode (if any). */ 1344 if (vbp->vb_buf.b_vp != NULL) { 1345 brelvp(&vbp->vb_buf); 1346 } 1347 1348 /* kill vbp structure */ 1349 pool_put(&vndbuf_pool, vbp); 1350 1351 /* 1352 * wrap up this transaction if it has run to completion or, in 1353 * case of an error, when all auxiliary buffers have returned. 1354 */ 1355 if (vnx->vx_error != 0) { 1356 /* pass error upward */ 1357 pbp->b_flags |= B_ERROR; 1358 pbp->b_error = vnx->vx_error; 1359 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { 1360 pool_put(&vndxfer_pool, vnx); 1361 biodone(pbp); 1362 } 1363 } else if (pbp->b_resid == 0) { 1364 KASSERT(vnx->vx_pending == 0); 1365 if ((vnx->vx_flags & VX_BUSY) == 0) { 1366 pool_put(&vndxfer_pool, vnx); 1367 biodone(pbp); 1368 } 1369 } 1370 1371 /* 1372 * done! start next swapdev I/O if one is pending 1373 */ 1374 sdp->swd_active--; 1375 sw_reg_start(sdp); 1376 splx(s); 1377 } 1378 1379 1380 /* 1381 * uvm_swap_alloc: allocate space on swap 1382 * 1383 * => allocation is done "round robin" down the priority list, as we 1384 * allocate in a priority we "rotate" the tail queue. 1385 * => space can be freed with uvm_swap_free 1386 * => we return the page slot number in /dev/drum (0 == invalid slot) 1387 * => we lock uvm.swap_data_lock 1388 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM 1389 */ 1390 int 1391 uvm_swap_alloc(int *nslots, boolean_t lessok) 1392 { 1393 struct swapdev *sdp; 1394 struct swappri *spp; 1395 u_long result; 1396 1397 /* 1398 * no swap devices configured yet? definite failure. 1399 */ 1400 if (uvmexp.nswapdev < 1) 1401 return 0; 1402 1403 /* 1404 * lock data lock, convert slots into blocks, and enter loop 1405 */ 1406 KERNEL_ASSERT_LOCKED(); 1407 ReTry: /* XXXMRG */ 1408 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 1409 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1410 /* if it's not enabled, then we can't swap from it */ 1411 if ((sdp->swd_flags & SWF_ENABLE) == 0) 1412 continue; 1413 if (sdp->swd_npginuse + *nslots > sdp->swd_npages) 1414 continue; 1415 if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN, 0, 1416 EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT, 1417 &result) != 0) { 1418 continue; 1419 } 1420 1421 /* 1422 * successful allocation! now rotate the tailq. 1423 */ 1424 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1425 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 1426 sdp->swd_npginuse += *nslots; 1427 uvmexp.swpginuse += *nslots; 1428 /* done! return drum slot number */ 1429 return(result + sdp->swd_drumoffset); 1430 } 1431 } 1432 1433 /* XXXMRG: BEGIN HACK */ 1434 if (*nslots > 1 && lessok) { 1435 *nslots = 1; 1436 goto ReTry; /* XXXMRG: ugh! extent should support this for us */ 1437 } 1438 /* XXXMRG: END HACK */ 1439 1440 return 0; /* failed */ 1441 } 1442 1443 /* 1444 * uvm_swapisfull: return true if all of available swap is allocated 1445 * and in use. 1446 */ 1447 int 1448 uvm_swapisfull(void) 1449 { 1450 int result; 1451 1452 KERNEL_LOCK(); 1453 KASSERT(uvmexp.swpgonly <= uvmexp.swpages); 1454 result = (uvmexp.swpgonly == uvmexp.swpages); 1455 KERNEL_UNLOCK(); 1456 1457 return result; 1458 } 1459 1460 /* 1461 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors 1462 * 1463 * => we lock uvm.swap_data_lock 1464 */ 1465 void 1466 uvm_swap_markbad(int startslot, int nslots) 1467 { 1468 struct swapdev *sdp; 1469 1470 KERNEL_LOCK(); 1471 sdp = swapdrum_getsdp(startslot); 1472 if (sdp != NULL) { 1473 /* 1474 * we just keep track of how many pages have been marked bad 1475 * in this device, to make everything add up in swap_off(). 1476 * we assume here that the range of slots will all be within 1477 * one swap device. 1478 */ 1479 sdp->swd_npgbad += nslots; 1480 } 1481 KERNEL_UNLOCK(); 1482 } 1483 1484 /* 1485 * uvm_swap_free: free swap slots 1486 * 1487 * => this can be all or part of an allocation made by uvm_swap_alloc 1488 * => we lock uvm.swap_data_lock 1489 */ 1490 void 1491 uvm_swap_free(int startslot, int nslots) 1492 { 1493 struct swapdev *sdp; 1494 1495 /* 1496 * ignore attempts to free the "bad" slot. 1497 */ 1498 1499 if (startslot == SWSLOT_BAD) { 1500 return; 1501 } 1502 1503 /* 1504 * convert drum slot offset back to sdp, free the blocks 1505 * in the extent, and return. must hold pri lock to do 1506 * lookup and access the extent. 1507 */ 1508 KERNEL_LOCK(); 1509 sdp = swapdrum_getsdp(startslot); 1510 KASSERT(uvmexp.nswapdev >= 1); 1511 KASSERT(sdp != NULL); 1512 KASSERT(sdp->swd_npginuse >= nslots); 1513 if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots, 1514 EX_MALLOCOK|EX_NOWAIT) != 0) { 1515 printf("warning: resource shortage: %d pages of swap lost\n", 1516 nslots); 1517 } 1518 1519 sdp->swd_npginuse -= nslots; 1520 uvmexp.swpginuse -= nslots; 1521 #ifdef UVM_SWAP_ENCRYPT 1522 { 1523 int i; 1524 if (swap_encrypt_initialized) { 1525 /* Dereference keys */ 1526 for (i = 0; i < nslots; i++) 1527 if (uvm_swap_needdecrypt(sdp, startslot + i)) { 1528 struct swap_key *key; 1529 1530 key = SWD_KEY(sdp, startslot + i); 1531 if (key->refcount != 0) 1532 SWAP_KEY_PUT(sdp, key); 1533 } 1534 1535 /* Mark range as not decrypt */ 1536 uvm_swap_markdecrypt(sdp, startslot, nslots, 0); 1537 } 1538 } 1539 #endif /* UVM_SWAP_ENCRYPT */ 1540 KERNEL_UNLOCK(); 1541 } 1542 1543 /* 1544 * uvm_swap_put: put any number of pages into a contig place on swap 1545 * 1546 * => can be sync or async 1547 */ 1548 int 1549 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) 1550 { 1551 int result; 1552 1553 result = uvm_swap_io(ppsp, swslot, npages, B_WRITE | 1554 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1555 1556 return (result); 1557 } 1558 1559 /* 1560 * uvm_swap_get: get a single page from swap 1561 * 1562 * => usually a sync op (from fault) 1563 */ 1564 int 1565 uvm_swap_get(struct vm_page *page, int swslot, int flags) 1566 { 1567 int result; 1568 1569 uvmexp.nswget++; 1570 KASSERT(flags & PGO_SYNCIO); 1571 if (swslot == SWSLOT_BAD) { 1572 return VM_PAGER_ERROR; 1573 } 1574 1575 KERNEL_LOCK(); 1576 /* this page is (about to be) no longer only in swap. */ 1577 uvmexp.swpgonly--; 1578 1579 result = uvm_swap_io(&page, swslot, 1, B_READ | 1580 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1581 1582 if (result != VM_PAGER_OK && result != VM_PAGER_PEND) { 1583 /* oops, the read failed so it really is still only in swap. */ 1584 uvmexp.swpgonly++; 1585 } 1586 KERNEL_UNLOCK(); 1587 return (result); 1588 } 1589 1590 /* 1591 * uvm_swap_io: do an i/o operation to swap 1592 */ 1593 1594 int 1595 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) 1596 { 1597 daddr_t startblk; 1598 struct buf *bp; 1599 vaddr_t kva; 1600 int result, s, mapinflags, pflag, bounce = 0, i; 1601 boolean_t write, async; 1602 vaddr_t bouncekva; 1603 struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT]; 1604 #ifdef UVM_SWAP_ENCRYPT 1605 struct swapdev *sdp; 1606 int encrypt = 0; 1607 #endif 1608 1609 KERNEL_ASSERT_LOCKED(); 1610 1611 write = (flags & B_READ) == 0; 1612 async = (flags & B_ASYNC) != 0; 1613 1614 /* convert starting drum slot to block number */ 1615 startblk = btodb((u_int64_t)startslot << PAGE_SHIFT); 1616 1617 /* 1618 * first, map the pages into the kernel (XXX: currently required 1619 * by buffer system). 1620 */ 1621 mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE; 1622 if (!async) 1623 mapinflags |= UVMPAGER_MAPIN_WAITOK; 1624 kva = uvm_pagermapin(pps, npages, mapinflags); 1625 if (kva == 0) 1626 return (VM_PAGER_AGAIN); 1627 1628 #ifdef UVM_SWAP_ENCRYPT 1629 if (write) { 1630 /* 1631 * Check if we need to do swap encryption on old pages. 1632 * Later we need a different scheme, that swap encrypts 1633 * all pages of a process that had at least one page swap 1634 * encrypted. Then we might not need to copy all pages 1635 * in the cluster, and avoid the memory overheard in 1636 * swapping. 1637 */ 1638 if (uvm_doswapencrypt) 1639 encrypt = 1; 1640 } 1641 1642 if (swap_encrypt_initialized || encrypt) { 1643 /* 1644 * we need to know the swap device that we are swapping to/from 1645 * to see if the pages need to be marked for decryption or 1646 * actually need to be decrypted. 1647 * XXX - does this information stay the same over the whole 1648 * execution of this function? 1649 */ 1650 sdp = swapdrum_getsdp(startslot); 1651 } 1652 1653 /* 1654 * Check that we are dma capable for read (write always bounces 1655 * through the swapencrypt anyway... 1656 */ 1657 if (write && encrypt) { 1658 bounce = 1; /* bounce through swapencrypt always */ 1659 } else { 1660 #else 1661 { 1662 #endif 1663 1664 for (i = 0; i < npages; i++) { 1665 if (VM_PAGE_TO_PHYS(pps[i]) < dma_constraint.ucr_low || 1666 VM_PAGE_TO_PHYS(pps[i]) > dma_constraint.ucr_high) { 1667 bounce = 1; 1668 break; 1669 } 1670 } 1671 } 1672 1673 if (bounce) { 1674 int swmapflags; 1675 1676 /* We always need write access. */ 1677 swmapflags = UVMPAGER_MAPIN_READ; 1678 if (!async) 1679 swmapflags |= UVMPAGER_MAPIN_WAITOK; 1680 1681 if (!uvm_swap_allocpages(tpps, npages)) { 1682 uvm_pagermapout(kva, npages); 1683 return (VM_PAGER_AGAIN); 1684 } 1685 1686 bouncekva = uvm_pagermapin(tpps, npages, swmapflags); 1687 if (bouncekva == 0) { 1688 uvm_pagermapout(kva, npages); 1689 uvm_swap_freepages(tpps, npages); 1690 return (VM_PAGER_AGAIN); 1691 } 1692 } 1693 1694 /* encrypt to swap */ 1695 if (write && bounce) { 1696 int i, opages; 1697 caddr_t src, dst; 1698 u_int64_t block; 1699 1700 src = (caddr_t) kva; 1701 dst = (caddr_t) bouncekva; 1702 block = startblk; 1703 for (i = 0; i < npages; i++) { 1704 #ifdef UVM_SWAP_ENCRYPT 1705 struct swap_key *key; 1706 1707 if (encrypt) { 1708 key = SWD_KEY(sdp, startslot + i); 1709 SWAP_KEY_GET(sdp, key); /* add reference */ 1710 1711 swap_encrypt(key, src, dst, block, PAGE_SIZE); 1712 block += btodb(PAGE_SIZE); 1713 } else { 1714 #else 1715 { 1716 #endif /* UVM_SWAP_ENCRYPT */ 1717 memcpy(dst, src, PAGE_SIZE); 1718 } 1719 /* this just tells async callbacks to free */ 1720 atomic_setbits_int(&tpps[i]->pg_flags, PQ_ENCRYPT); 1721 src += PAGE_SIZE; 1722 dst += PAGE_SIZE; 1723 } 1724 1725 uvm_pagermapout(kva, npages); 1726 1727 /* dispose of pages we dont use anymore */ 1728 opages = npages; 1729 uvm_pager_dropcluster(NULL, NULL, pps, &opages, 1730 PGO_PDFREECLUST); 1731 1732 kva = bouncekva; 1733 } 1734 1735 /* 1736 * now allocate a buf for the i/o. 1737 * [make sure we don't put the pagedaemon to sleep...] 1738 */ 1739 pflag = (async || curproc == uvm.pagedaemon_proc) ? PR_NOWAIT : 1740 PR_WAITOK; 1741 bp = pool_get(&bufpool, pflag | PR_ZERO); 1742 1743 /* 1744 * if we failed to get a swapbuf, return "try again" 1745 */ 1746 if (bp == NULL) { 1747 if (write && bounce) { 1748 #ifdef UVM_SWAP_ENCRYPT 1749 int i; 1750 1751 /* swap encrypt needs cleanup */ 1752 if (encrypt) 1753 for (i = 0; i < npages; i++) 1754 SWAP_KEY_PUT(sdp, SWD_KEY(sdp, 1755 startslot + i)); 1756 #endif 1757 1758 uvm_pagermapout(kva, npages); 1759 uvm_swap_freepages(tpps, npages); 1760 } 1761 return (VM_PAGER_AGAIN); 1762 } 1763 1764 /* 1765 * prevent ASYNC reads. 1766 * uvm_swap_io is only called from uvm_swap_get, uvm_swap_get 1767 * assumes that all gets are SYNCIO. Just make sure here. 1768 * XXXARTUBC - might not be true anymore. 1769 */ 1770 if (!write) { 1771 flags &= ~B_ASYNC; 1772 async = 0; 1773 } 1774 1775 /* 1776 * fill in the bp. we currently route our i/o through 1777 * /dev/drum's vnode [swapdev_vp]. 1778 */ 1779 bp->b_flags = B_BUSY | B_NOCACHE | B_RAW | (flags & (B_READ|B_ASYNC)); 1780 bp->b_proc = &proc0; /* XXX */ 1781 bp->b_vnbufs.le_next = NOLIST; 1782 if (bounce) 1783 bp->b_data = (caddr_t)bouncekva; 1784 else 1785 bp->b_data = (caddr_t)kva; 1786 bp->b_bq = NULL; 1787 bp->b_blkno = startblk; 1788 LIST_INIT(&bp->b_dep); 1789 s = splbio(); 1790 bp->b_vp = NULL; 1791 buf_replacevnode(bp, swapdev_vp); 1792 splx(s); 1793 bp->b_bufsize = bp->b_bcount = (long)npages << PAGE_SHIFT; 1794 1795 /* 1796 * for pageouts we must set "dirtyoff" [NFS client code needs it]. 1797 * and we bump v_numoutput (counter of number of active outputs). 1798 */ 1799 if (write) { 1800 bp->b_dirtyoff = 0; 1801 bp->b_dirtyend = npages << PAGE_SHIFT; 1802 #ifdef UVM_SWAP_ENCRYPT 1803 /* mark the pages in the drum for decryption */ 1804 if (swap_encrypt_initialized) 1805 uvm_swap_markdecrypt(sdp, startslot, npages, encrypt); 1806 #endif 1807 s = splbio(); 1808 swapdev_vp->v_numoutput++; 1809 splx(s); 1810 } 1811 1812 /* for async ops we must set up the iodone handler. */ 1813 if (async) { 1814 bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ? 1815 B_PDAEMON : 0); 1816 bp->b_iodone = uvm_aio_biodone; 1817 } 1818 1819 /* now we start the I/O, and if async, return. */ 1820 VOP_STRATEGY(bp); 1821 if (async) 1822 return (VM_PAGER_PEND); 1823 1824 /* must be sync i/o. wait for it to finish */ 1825 (void) biowait(bp); 1826 result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK; 1827 1828 /* decrypt swap */ 1829 if (!write && !(bp->b_flags & B_ERROR)) { 1830 int i; 1831 caddr_t data = (caddr_t)kva; 1832 caddr_t dst = (caddr_t)kva; 1833 u_int64_t block = startblk; 1834 1835 if (bounce) 1836 data = (caddr_t)bouncekva; 1837 1838 for (i = 0; i < npages; i++) { 1839 #ifdef UVM_SWAP_ENCRYPT 1840 struct swap_key *key; 1841 1842 /* Check if we need to decrypt */ 1843 if (swap_encrypt_initialized && 1844 uvm_swap_needdecrypt(sdp, startslot + i)) { 1845 key = SWD_KEY(sdp, startslot + i); 1846 if (key->refcount == 0) { 1847 result = VM_PAGER_ERROR; 1848 break; 1849 } 1850 swap_decrypt(key, data, dst, block, PAGE_SIZE); 1851 } else if (bounce) { 1852 #else 1853 if (bounce) { 1854 #endif 1855 memcpy(dst, data, PAGE_SIZE); 1856 } 1857 data += PAGE_SIZE; 1858 dst += PAGE_SIZE; 1859 block += btodb(PAGE_SIZE); 1860 } 1861 if (bounce) 1862 uvm_pagermapout(bouncekva, npages); 1863 } 1864 /* kill the pager mapping */ 1865 uvm_pagermapout(kva, npages); 1866 1867 /* Not anymore needed, free after encryption/bouncing */ 1868 if (!write && bounce) 1869 uvm_swap_freepages(tpps, npages); 1870 1871 /* now dispose of the buf */ 1872 s = splbio(); 1873 if (bp->b_vp) 1874 brelvp(bp); 1875 1876 if (write && bp->b_vp) 1877 vwakeup(bp->b_vp); 1878 pool_put(&bufpool, bp); 1879 splx(s); 1880 1881 /* finally return. */ 1882 return (result); 1883 } 1884 1885 void 1886 swapmount(void) 1887 { 1888 struct swapdev *sdp; 1889 struct swappri *spp; 1890 struct vnode *vp; 1891 dev_t swap_dev = swdevt[0].sw_dev; 1892 char *nam; 1893 char path[MNAMELEN + 1]; 1894 1895 /* 1896 * No locking here since we happen to know that we will just be called 1897 * once before any other process has forked. 1898 */ 1899 if (swap_dev == NODEV) 1900 return; 1901 1902 #if defined(NFSCLIENT) 1903 if (swap_dev == NETDEV) { 1904 extern struct nfs_diskless nfs_diskless; 1905 1906 snprintf(path, sizeof(path), "%s", 1907 nfs_diskless.nd_swap.ndm_host); 1908 vp = nfs_diskless.sw_vp; 1909 goto gotit; 1910 } else 1911 #endif 1912 if (bdevvp(swap_dev, &vp)) 1913 return; 1914 1915 /* Construct a potential path to swap */ 1916 if ((nam = findblkname(major(swap_dev)))) 1917 snprintf(path, sizeof(path), "/dev/%s%d%c", nam, 1918 DISKUNIT(swap_dev), 'a' + DISKPART(swap_dev)); 1919 else 1920 snprintf(path, sizeof(path), "blkdev0x%x", 1921 swap_dev); 1922 1923 #if defined(NFSCLIENT) 1924 gotit: 1925 #endif 1926 sdp = malloc(sizeof(*sdp), M_VMSWAP, M_WAITOK|M_ZERO); 1927 spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK); 1928 1929 sdp->swd_flags = SWF_FAKE; 1930 sdp->swd_dev = swap_dev; 1931 1932 sdp->swd_pathlen = strlen(path) + 1; 1933 sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK | M_ZERO); 1934 strlcpy(sdp->swd_path, path, sdp->swd_pathlen); 1935 1936 sdp->swd_vp = vp; 1937 1938 swaplist_insert(sdp, spp, 0); 1939 1940 if (swap_on(curproc, sdp)) { 1941 swaplist_find(vp, 1); 1942 swaplist_trim(); 1943 vput(sdp->swd_vp); 1944 free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen); 1945 free(sdp, M_VMSWAP, sizeof(*sdp)); 1946 return; 1947 } 1948 } 1949 1950 #ifdef HIBERNATE 1951 int 1952 uvm_hibswap(dev_t dev, u_long *sp, u_long *ep) 1953 { 1954 struct swapdev *sdp, *swd = NULL; 1955 struct swappri *spp; 1956 struct extent_region *exr, *exrn; 1957 u_long start = 0, end = 0, size = 0; 1958 1959 /* no swap devices configured yet? */ 1960 if (uvmexp.nswapdev < 1 || dev != swdevt[0].sw_dev) 1961 return (1); 1962 1963 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 1964 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1965 if (sdp->swd_dev == dev) 1966 swd = sdp; 1967 } 1968 } 1969 1970 if (swd == NULL || (swd->swd_flags & SWF_ENABLE) == 0) 1971 return (1); 1972 1973 LIST_FOREACH(exr, &swd->swd_ex->ex_regions, er_link) { 1974 u_long gapstart, gapend, gapsize; 1975 1976 gapstart = exr->er_end + 1; 1977 exrn = LIST_NEXT(exr, er_link); 1978 if (!exrn) 1979 break; 1980 gapend = exrn->er_start - 1; 1981 gapsize = gapend - gapstart; 1982 if (gapsize > size) { 1983 start = gapstart; 1984 end = gapend; 1985 size = gapsize; 1986 } 1987 } 1988 1989 if (size) { 1990 *sp = start; 1991 *ep = end; 1992 return (0); 1993 } 1994 return (1); 1995 } 1996 #endif /* HIBERNATE */ 1997