/*	$OpenBSD: uvm_swap.c,v 1.161 2022/07/18 18:02:27 jca Exp $	*/
/*	$NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/disk.h>
#include <sys/task.h>
#include <sys/pledge.h>
#if defined(NFSCLIENT)
#include <sys/socket.h>
#include <netinet/in.h>
#include <nfs/nfsproto.h>
#include <nfs/nfsdiskless.h>
#endif

#include <uvm/uvm.h>
#ifdef UVM_SWAP_ENCRYPT
#include <uvm/uvm_swap_encrypt.h>
#endif

#include <sys/specdev.h>

#include "vnd.h"

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * TAILQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes disklabel]
 */
struct swapdev {
	struct swapent	swd_se;
#define	swd_dev		swd_se.se_dev		/* device id */
#define	swd_flags	swd_se.se_flags		/* flags:inuse/enable/fake */
#define	swd_priority	swd_se.se_priority	/* our priority */
#define	swd_inuse	swd_se.se_inuse		/* blocks used */
#define	swd_nblks	swd_se.se_nblks		/* total blocks */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	char			swd_exname[12];	/* name of extent above */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	int			swd_active;	/* # of active i/o reqs */
	struct bufq		swd_bufq;
	struct ucred		*swd_cred;	/* cred for file access */
#ifdef UVM_SWAP_ENCRYPT
#define SWD_KEY_SHIFT		7		/* One key per 0.5 MByte */
#define SWD_KEY(x,y)		&((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT])
#define SWD_KEY_SIZE(x)		(((x) + (1 << SWD_KEY_SHIFT) - 1) >> SWD_KEY_SHIFT)

#define SWD_DCRYPT_SHIFT	5
#define SWD_DCRYPT_BITS		32
#define SWD_DCRYPT_MASK		(SWD_DCRYPT_BITS - 1)
#define SWD_DCRYPT_OFF(x)	((x) >> SWD_DCRYPT_SHIFT)
#define SWD_DCRYPT_BIT(x)	((x) & SWD_DCRYPT_MASK)
#define SWD_DCRYPT_SIZE(x)	(SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK) * sizeof(u_int32_t))
	u_int32_t		*swd_decrypt;	/* bitmap for decryption */
	struct swap_key		*swd_keys;	/* keys for different parts */
#endif
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_vnx;
	struct task	vb_task;
};

/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
struct pool vndxfer_pool;
struct pool vndbuf_pool;


/*
 * local variables
 */
struct extent *swapmap;		/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
struct swap_priority swap_priority;

/* locks */
struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");

struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM);
struct vm_page *oompps[SWCLUSTPAGES];
int oom = 0;

/*
 * prototypes
 */
void		 swapdrum_add(struct swapdev *, int);
struct swapdev	*swapdrum_getsdp(int);

struct swapdev	*swaplist_find(struct vnode *, int);
void		 swaplist_insert(struct swapdev *,
		     struct swappri *, int);
void		 swaplist_trim(void);

int swap_on(struct proc *, struct swapdev *);
int swap_off(struct proc *, struct swapdev *);

void sw_reg_strategy(struct swapdev *, struct buf *, int);
void sw_reg_iodone(struct buf *);
void sw_reg_iodone_internal(void *);
void sw_reg_start(struct swapdev *);

int uvm_swap_io(struct vm_page **, int, int, int);

void swapmount(void);
int uvm_swap_allocpages(struct vm_page **, int, int);

#ifdef UVM_SWAP_ENCRYPT
/* for swap encrypt */
void uvm_swap_markdecrypt(struct swapdev *, int, int, int);
boolean_t uvm_swap_needdecrypt(struct swapdev *, int);
void uvm_swap_initcrypt(struct swapdev *, int);
#endif

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	int error;

	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */
	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;

	if (!swapdev_vp && bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block extent to map /dev/drum.  The extent spans
	 * 1 to INT_MAX, which allows 2 gigablocks of swap space.  Note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
	    M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/* allocate pools for structures used for swapping to files. */
	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, IPL_BIO, 0,
	    "swp vnx", NULL);
	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, IPL_BIO, 0,
	    "swp vnd", NULL);

	/* allocate pages for OOM situations. */
	error = uvm_swap_allocpages(oompps, SWCLUSTPAGES, UVM_PLA_NOWAIT);
	KASSERT(error == 0);

	/* Setup the initial swap partition */
	swapmount();
}

#ifdef UVM_SWAP_ENCRYPT
void
uvm_swap_initcrypt_all(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	int npages;


	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_decrypt == NULL) {
				npages = dbtob((uint64_t)sdp->swd_nblks) >>
				    PAGE_SHIFT;
				uvm_swap_initcrypt(sdp, npages);
			}
		}
	}
}

void
uvm_swap_initcrypt(struct swapdev *sdp, int npages)
{
	/*
	 * keep track of whether a page needs to be decrypted when we get
	 * it from the swap device.
	 * We cannot chance a malloc later; if we are doing ASYNC puts,
	 * we may not call malloc with M_WAITOK.  This consumes only
	 * 8KB of memory for a 256MB swap partition.
	 */
	sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages), M_VMSWAP,
	    M_WAITOK|M_ZERO);
	sdp->swd_keys = mallocarray(SWD_KEY_SIZE(npages),
	    sizeof(struct swap_key), M_VMSWAP, M_WAITOK|M_ZERO);
}

#endif /* UVM_SWAP_ENCRYPT */

int
uvm_swap_allocpages(struct vm_page **pps, int npages, int flags)
{
	struct pglist pgl;
	int error, i;

	KASSERT(npages <= SWCLUSTPAGES);

	TAILQ_INIT(&pgl);
again:
	error = uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
	    dma_constraint.ucr_high, 0, 0, &pgl, npages, flags);
	if (error && (curproc == uvm.pagedaemon_proc)) {
		mtx_enter(&oommtx);
		if (oom) {
			msleep_nsec(&oom, &oommtx, PVM | PNORELOCK,
			    "oom", INFSLP);
			goto again;
		}
		oom = 1;
		for (i = 0; i < npages; i++) {
			pps[i] = oompps[i];
			atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
		}
		mtx_leave(&oommtx);
		return 0;
	}
	if (error)
		return error;

	for (i = 0; i < npages; i++) {
		pps[i] = TAILQ_FIRST(&pgl);
		/* *sigh* */
		atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
		TAILQ_REMOVE(&pgl, pps[i], pageq);
	}

	return 0;
}

void
uvm_swap_freepages(struct vm_page **pps, int npages)
{
	int i;

	if (pps[0] == oompps[0]) {
		for (i = 0; i < npages; i++)
			uvm_pageclean(pps[i]);

		mtx_enter(&oommtx);
		KASSERT(oom == 1);
		oom = 0;
		mtx_leave(&oommtx);
		wakeup(&oom);
		return;
	}

	uvm_lock_pageq();
	for (i = 0; i < npages; i++)
		uvm_pagefree(pps[i]);
	uvm_unlock_pageq();

}

#ifdef UVM_SWAP_ENCRYPT
/*
 * Mark pages on the swap device for later decryption
 */

void
uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages,
    int decrypt)
{
	int pagestart, i;
	int off, bit;

	if (!sdp)
		return;

	pagestart = startslot - sdp->swd_drumoffset;
	for (i = 0; i < npages; i++, pagestart++) {
		off = SWD_DCRYPT_OFF(pagestart);
		bit = SWD_DCRYPT_BIT(pagestart);
		if (decrypt)
			/* pages read need decryption */
			sdp->swd_decrypt[off] |= 1 << bit;
		else
			/* pages read do not need decryption */
			sdp->swd_decrypt[off] &= ~(1 << bit);
	}
}

/*
 * Check if the page that we got from disk needs to be decrypted
 */

boolean_t
uvm_swap_needdecrypt(struct swapdev *sdp, int off)
{
	if (!sdp)
		return FALSE;

	off -= sdp->swd_drumoffset;
	return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)] & (1 << SWD_DCRYPT_BIT(off)) ?
	    TRUE : FALSE;
}

void
uvm_swap_finicrypt_all(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct swap_key *key;
	unsigned int nkeys;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_decrypt == NULL)
				continue;

			nkeys = dbtob((uint64_t)sdp->swd_nblks) >> PAGE_SHIFT;
			key = sdp->swd_keys + (SWD_KEY_SIZE(nkeys) - 1);
			do {
				if (key->refcount != 0)
					swap_key_delete(key);
			} while (key-- != sdp->swd_keys);
		}
	}
}
#endif /* UVM_SWAP_ENCRYPT */

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc from
 *	blocking here while adding swap)
 */
void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */

		spp->spi_priority = priority;
		TAILQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		free(newspp, M_VMSWAP, sizeof(*newspp));
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * tailq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 * global list.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => we return the swapdev we found (and removed)
 */
struct swapdev *
swaplist_find(struct vnode *vp, boolean_t remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp != vp)
				continue;
			if (remove) {
				TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
				uvmexp.nswapdev--;
			}
			return (sdp);
		}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 * them.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 */
void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
		if (!TAILQ_EMPTY(&spp->spi_swapdev))
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP, sizeof(*spp));
	}
}

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => uvm.swap_data_lock should be unlocked (we may sleep)
 */
void
swapdrum_add(struct swapdev *sdp, int npages)
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm.swap_data_lock
 */
struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}


/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[MAXPATHLEN];
	size_t	len;
	int	count, error, misc;
	int	priority;

	misc = SCARG(uap, misc);

	if ((error = pledge_swapctl(p, SCARG(uap, cmd))))
		return error;

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter_write(&swap_syscall_lock);

	/*
	 * we handle the non-priv NSWAP and STATS request first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm.swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		LIST_FOREACH(spp, &swap_priority, spi_swappri) {
			TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
				if (count >= misc)
					continue;

				sdp->swd_inuse =
				    btodb((u_int64_t)sdp->swd_npginuse <<
				    PAGE_SHIFT);
				error = copyout(&sdp->swd_se, sep,
				    sizeof(struct swapent));
				if (error)
					goto out;

				/* now copy out the path if necessary */
				error = copyoutstr(sdp->swd_path,
				    sep->se_path, sizeof(sep->se_path), NULL);
				if (error)
					goto out;

				count++;
				sep++;
			}
		}

		*retval = count;
		error = 0;
		goto out;
	}

	/* all other requests require superuser privs.  verify. */
	if ((error = suser(p)))
		goto out;

	/*
	 * at this point we expect a path name in arg.  we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 */
	error = copyinstr(SCARG(uap, arg), userpath, sizeof(userpath), &len);
	if (error)
		goto out;
	disk_map(userpath, userpath, sizeof(userpath), DM_OPENBLCK);
	NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, userpath, p);
	if ((error = namei(&nd)))
		goto out;
	vp = nd.ni_vp;
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {
	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		dumpdev = vp->v_rdev;
		break;
	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		if (error)
			free(spp, M_VMSWAP, sizeof(*spp));
		break;
	case SWAP_ON:
		/*
		 * If the device is a regular file, make sure the filesystem
		 * can be used for swapping.
		 */
		if (vp->v_type == VREG &&
		    (vp->v_mount->mnt_flag & MNT_SWAPPABLE) == 0) {
			error = ENOTSUP;
			break;
		}

		/*
		 * check for duplicates.  if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */
		priority = SCARG(uap, misc);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			break;
		}
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK|M_ZERO);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;

		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG) {
			sdp->swd_cred = crdup(p->p_ucred);
		}

		swaplist_insert(sdp, spp, priority);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		strlcpy(sdp->swd_path, userpath, len);

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(p, sdp)) != 0) {
			(void) swaplist_find(vp, 1);	/* kill fake entry */
			swaplist_trim();
			if (vp->v_type == VREG) {
				crfree(sdp->swd_cred);
			}
			free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
			free(sdp, M_VMSWAP, sizeof(*sdp));
			break;
		}
		break;
	case SWAP_OFF:
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(p, sdp);
		break;
	default:
		error = EINVAL;
	}

	/* done!  release the ref gained by namei() and unlock. */
	vput(vp);

out:
	rw_exit_write(&swap_syscall_lock);

	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => caller should leave uvm.swap_data_lock unlocked, we may lock it
 *	if needed.
 */
int
swap_on(struct proc *p, struct swapdev *sdp)
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	struct vattr va;
#if defined(NFSCLIENT)
	extern const struct vops nfs_vops;
#endif /* defined(NFSCLIENT) */
	dev_t dev;

	/*
	 * we want to enable swapping on sdp.  the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

#if NVND > 0
	/* no swapping to vnds. */
	if (bdevsw[major(dev)].d_strategy == vndstrategy)
		return (EOPNOTSUPP);
#endif

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	/*
	 * we now need to determine the size of the swap area.  for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.  take it easy on NFS servers.
		 */
#if defined(NFSCLIENT)
		if (vp->v_op == &nfs_vops)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* defined(NFSCLIENT) */
			sdp->swd_maxactive = 8; /* XXX */
		bufq_init(&sdp->swd_bufq, BUFQ_FIFO);
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.  we want at least one page.
	 */

	if (size < 1) {
		error = EINVAL;
		goto bad;
	}

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	snprintf(sdp->swd_exname, sizeof(sdp->swd_exname), "swap0x%04x",
	    count++);

	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(sdp->swd_exname, 0, npages - 1, M_VMSWAP,
	    0, 0, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel reserve");
		/* XXX: is extent synchronized with swd_npginuse? */
	}
#ifdef HIBERNATE
	/*
	 * Lock down the last region of primary disk swap, in case
	 * hibernate needs to place a signature there.
	 */
	if (dev == swdevt[0].sw_dev && vp->v_type == VBLK && size > 3) {
		if (extent_alloc_region(sdp->swd_ex,
		    npages - 1 - 1, 1, EX_WAITOK))
			panic("hibernate reserve");
		/* XXX: is extent synchronized with swd_npginuse? */
	}
#endif

	/* add a ref to vp to reflect usage as a swap device. */
	vref(vp);

#ifdef UVM_SWAP_ENCRYPT
	if (uvm_doswapencrypt)
		uvm_swap_initcrypt(sdp, npages);
#endif
	/* now add the new swapdev to the drum and enable. */
	swapdrum_add(sdp, npages);
	sdp->swd_npages = size;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	return (0);

bad:
	/* failure: close device if necessary and return error. */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
int
swap_off(struct proc *p, struct swapdev *sdp)
{
	int error = 0;

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize)) {

		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		sdp->swd_flags |= SWF_ENABLE;
		return (error);
	}

	/*
	 * done with the vnode and saved creds.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	if (sdp->swd_vp->v_type == VREG) {
		crfree(sdp->swd_cred);
	}
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	}

	uvmexp.swpages -= sdp->swd_npages;

	if (swaplist_find(sdp->swd_vp, 1) == NULL)
		panic("swap_off: swapdev not in list");
	swaplist_trim();

	/*
	 * free all resources!
	 */
	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
	    EX_WAITOK);
	extent_destroy(sdp->swd_ex);
	/* free sdp->swd_path ? */
	free(sdp, M_VMSWAP, sizeof(*sdp));
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	int s, pageno, bn;

	/*
	 * convert block number to swapdev.  note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((u_int64_t)bp->b_blkno) >> PAGE_SHIFT;
	sdp = swapdrum_getsdp(pageno);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	/* convert drum page number to block number on this swapdev. */
	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */
	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
	case VBLK:
		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		buf_replacevnode(bp, sdp->swd_vp);

		bp->b_blkno = bn;
		splx(s);
		VOP_STRATEGY(bp->b_vp, bp);
		return;
	case VREG:
		/* delegate to sw_reg_strategy function. */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	caddr_t		addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((u_int64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;
		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == -1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		/*
		 * now get a buf structure.  note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
		nbp->vb_buf.b_flags = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount = sz;
		nbp->vb_buf.b_bufsize = sz;
		nbp->vb_buf.b_error = 0;
		nbp->vb_buf.b_data = addr;
		nbp->vb_buf.b_bq = NULL;
		nbp->vb_buf.b_blkno = nbn + btodb(off);
		nbp->vb_buf.b_proc = bp->b_proc;
		nbp->vb_buf.b_iodone = sw_reg_iodone;
		nbp->vb_buf.b_vp = NULLVP;
		nbp->vb_buf.b_vnbufs.le_next = NOLIST;
		LIST_INIT(&nbp->vb_buf.b_dep);

		/*
		 * set b_dirtyoff/end and b_validoff/end.  this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		/* patch it back to the vnx */
		nbp->vb_vnx = vnx;
		task_set(&nbp->vb_task, sw_reg_iodone_internal, nbp);

		s = splbio();
		if (vnx->vx_error != 0) {
			pool_put(&vndbuf_pool, nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* start I/O if we are not over our limit */
		bufq_queue(&sdp->swd_bufq, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		pool_put(&vndxfer_pool, vnx);
		biodone(bp);
	}
	splx(s);
}

/* sw_reg_start: start an I/O request on the requested swapdev. */
void
sw_reg_start(struct swapdev *sdp)
{
	struct buf *bp;

	/* XXX: recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = bufq_dequeue(&sdp->swd_bufq);
		if (bp == NULL)
			break;

		sdp->swd_active++;

		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;

		VOP_STRATEGY(bp->b_vp, bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 *
 * XXX:
 * We only put this onto a taskq here, because of the maxactive game since
 * it basically requires us to call back into VOP_STRATEGY() (where we must
 * be able to sleep) via sw_reg_start().
 */
void
sw_reg_iodone(struct buf *bp)
{
	struct vndbuf *vbp = (struct vndbuf *)bp;
	task_add(systq, &vbp->vb_task);
}

void
sw_reg_iodone_internal(void *xvbp)
{
	struct vndbuf *vbp = xvbp;
	struct vndxfer *vnx = vbp->vb_vnx;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int resid, s;

	s = splbio();

	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	/* pass error upward */
	if (vbp->vb_buf.b_error)
		vnx->vx_error = vbp->vb_buf.b_error;

	/* disassociate this buffer from the vnode (if any). */
	if (vbp->vb_buf.b_vp != NULL) {
		brelvp(&vbp->vb_buf);
	}

	/* kill vbp structure */
	pool_put(&vndbuf_pool, vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			pool_put(&vndxfer_pool, vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			pool_put(&vndxfer_pool, vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!  start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the tail queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm.swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots, boolean_t lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long	result;

	/*
	 * no swap devices configured yet?  definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	KERNEL_ASSERT_LOCKED();
ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN, 0,
			    EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
			    &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the tailq.
			 */
			TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			/* done!  return drum slot number */
			return result + sdp->swd_drumoffset;
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	return 0;		/* failed */
}

/*
 * uvm_swapisfull: return true if all of available swap is allocated
 * and in use.
 */
int
uvm_swapisfull(void)
{
	int result;

	KERNEL_LOCK();
	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
	result = (uvmexp.swpgonly == uvmexp.swpages);
	KERNEL_UNLOCK();

	return result;
}

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;

	KERNEL_LOCK();
	sdp = swapdrum_getsdp(startslot);
	if (sdp != NULL) {
		/*
		 * we just keep track of how many pages have been marked bad
		 * in this device, to make everything add up in swap_off().
		 * we assume here that the range of slots will all be within
		 * one swap device.
		 */
		sdp->swd_npgbad += nslots;
	}
	KERNEL_UNLOCK();
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.  must hold pri lock to do
	 * lookup and access the extent.
	 */
	KERNEL_LOCK();
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
	    EX_MALLOCOK|EX_NOWAIT) != 0) {
		printf("warning: resource shortage: %d pages of swap lost\n",
		    nslots);
	}

	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef UVM_SWAP_ENCRYPT
	{
		int i;
		if (swap_encrypt_initialized) {
			/* Dereference keys */
			for (i = 0; i < nslots; i++)
				if (uvm_swap_needdecrypt(sdp, startslot + i)) {
					struct swap_key *key;

					key = SWD_KEY(sdp, startslot + i);
					if (key->refcount != 0)
						SWAP_KEY_PUT(sdp, key);
				}

			/* Mark range as not decrypt */
			uvm_swap_markdecrypt(sdp, startslot, nslots, 0);
		}
	}
#endif /* UVM_SWAP_ENCRYPT */
	KERNEL_UNLOCK();
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 */
int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int result;

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 */
int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int result;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return VM_PAGER_ERROR;
	}

	KERNEL_LOCK();
	result = uvm_swap_io(&page, swslot, 1, B_READ);
	KERNEL_UNLOCK();

	if (result == VM_PAGER_OK || result == VM_PAGER_PEND) {
		/*
		 * this page is no longer only in swap.
		 */
		atomic_dec_int(&uvmexp.swpgonly);
	}
	return (result);
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct buf *bp;
	vaddr_t kva;
	int	result, s, mapinflags, pflag, bounce = 0, i;
	boolean_t write, async;
	vaddr_t bouncekva;
	struct vm_page *tpps[SWCLUSTPAGES];
	int pdaemon = (curproc == uvm.pagedaemon_proc);
#ifdef UVM_SWAP_ENCRYPT
	struct swapdev *sdp;
	int	encrypt = 0;
#endif

	KERNEL_ASSERT_LOCKED();

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/* convert starting drum slot to block number */
	startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);

	pflag = (async || pdaemon) ? PR_NOWAIT : PR_WAITOK;
	bp = pool_get(&bufpool, pflag | PR_ZERO);
	if (bp == NULL)
		return (VM_PAGER_AGAIN);

	/*
	 * map the pages into the kernel (XXX: currently required
	 * by buffer system).
	 */
	mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
	if (!async)
		mapinflags |= UVMPAGER_MAPIN_WAITOK;
	kva = uvm_pagermapin(pps, npages, mapinflags);
	if (kva == 0) {
		pool_put(&bufpool, bp);
		return (VM_PAGER_AGAIN);
	}

#ifdef UVM_SWAP_ENCRYPT
	if (write) {
		/*
		 * Check if we need to do swap encryption on old pages.
		 * Later we need a different scheme, that swap encrypts
		 * all pages of a process that had at least one page swap
		 * encrypted.  Then we might not need to copy all pages
		 * in the cluster, and avoid the memory overhead in
		 * swapping.
		 */
		if (uvm_doswapencrypt)
			encrypt = 1;
	}

	if (swap_encrypt_initialized || encrypt) {
		/*
		 * we need to know the swap device that we are swapping to/from
		 * to see if the pages need to be marked for decryption or
		 * actually need to be decrypted.
		 * XXX - does this information stay the same over the whole
		 * execution of this function?
		 */
		sdp = swapdrum_getsdp(startslot);
	}

	/*
	 * Check that we are dma capable for read (write always bounces
	 * through the swapencrypt anyway).
	 */
	if (write && encrypt) {
		bounce = 1;	/* bounce through swapencrypt always */
	} else {
#else
	{
#endif

		for (i = 0; i < npages; i++) {
			if (VM_PAGE_TO_PHYS(pps[i]) < dma_constraint.ucr_low ||
			   VM_PAGE_TO_PHYS(pps[i]) > dma_constraint.ucr_high) {
				bounce = 1;
				break;
			}
		}
	}

	if (bounce) {
		int swmapflags, plaflags;

		/* We always need write access. */
		swmapflags = UVMPAGER_MAPIN_READ;
		plaflags = UVM_PLA_NOWAIT;
		if (!async) {
			swmapflags |= UVMPAGER_MAPIN_WAITOK;
			plaflags = UVM_PLA_WAITOK;
		}
		if (uvm_swap_allocpages(tpps, npages, plaflags)) {
			pool_put(&bufpool, bp);
			uvm_pagermapout(kva, npages);
			return (VM_PAGER_AGAIN);
		}

		bouncekva = uvm_pagermapin(tpps, npages, swmapflags);
		if (bouncekva == 0) {
			pool_put(&bufpool, bp);
			uvm_pagermapout(kva, npages);
			uvm_swap_freepages(tpps, npages);
			return (VM_PAGER_AGAIN);
		}
	}

	/* encrypt to swap */
	if (write && bounce) {
		int i, opages;
		caddr_t src, dst;
		u_int64_t block;

		src = (caddr_t) kva;
		dst = (caddr_t) bouncekva;
		block = startblk;
		for (i = 0; i < npages; i++) {
#ifdef UVM_SWAP_ENCRYPT
			struct swap_key *key;

			if (encrypt) {
				key = SWD_KEY(sdp, startslot + i);
				SWAP_KEY_GET(sdp, key);	/* add reference */

				swap_encrypt(key, src, dst, block, PAGE_SIZE);
				block += btodb(PAGE_SIZE);
			} else {
#else
			{
#endif /* UVM_SWAP_ENCRYPT */
				memcpy(dst, src, PAGE_SIZE);
			}
			/* this just tells async callbacks to free */
			atomic_setbits_int(&tpps[i]->pg_flags, PQ_ENCRYPT);
			src += PAGE_SIZE;
			dst += PAGE_SIZE;
		}

		uvm_pagermapout(kva, npages);

		/* dispose of pages we don't use anymore */
		opages = npages;
		uvm_pager_dropcluster(NULL, NULL, pps, &opages,
		    PGO_PDFREECLUST);

		kva = bouncekva;
	}

	/*
	 * prevent ASYNC reads.
	 * uvm_swap_io is only called from uvm_swap_get, and uvm_swap_get
	 * assumes that all gets are SYNCIO.  Just make sure here.
	 * XXXARTUBC - might not be true anymore.
	 */
	if (!write) {
		flags &= ~B_ASYNC;
		async = 0;
	}

	/*
	 * fill in the bp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp->b_flags = B_BUSY | B_NOCACHE | B_RAW | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	if (bounce)
		bp->b_data = (caddr_t)bouncekva;
	else
		bp->b_data = (caddr_t)kva;
	bp->b_bq = NULL;
	bp->b_blkno = startblk;
	LIST_INIT(&bp->b_dep);
	s = splbio();
	bp->b_vp = NULL;
	buf_replacevnode(bp, swapdev_vp);
	splx(s);
	bp->b_bufsize = bp->b_bcount = (long)npages << PAGE_SHIFT;

	/*
	 * for pageouts we must set "dirtyoff" [NFS client code needs it],
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if (write) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages << PAGE_SHIFT;
#ifdef UVM_SWAP_ENCRYPT
		/* mark the pages in the drum for decryption */
		if (swap_encrypt_initialized)
			uvm_swap_markdecrypt(sdp, startslot, npages, encrypt);
#endif
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/* for async ops we must set up the iodone handler. */
	if (async) {
		bp->b_flags |= B_CALL | (pdaemon ? B_PDAEMON : 0);
		bp->b_iodone = uvm_aio_biodone;
	}

	/* now we start the I/O, and if async, return. */
	VOP_STRATEGY(bp->b_vp, bp);
	if (async)
		return (VM_PAGER_PEND);

	/* must be sync i/o.  wait for it to finish */
	(void) biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

	/* decrypt swap */
	if (!write && !(bp->b_flags & B_ERROR)) {
		int i;
		caddr_t data = (caddr_t)kva;
		caddr_t dst = (caddr_t)kva;
		u_int64_t block = startblk;

		if (bounce)
			data = (caddr_t)bouncekva;

		for (i = 0; i < npages; i++) {
#ifdef UVM_SWAP_ENCRYPT
			struct swap_key *key;

			/* Check if we need to decrypt */
			if (swap_encrypt_initialized &&
			    uvm_swap_needdecrypt(sdp, startslot + i)) {
				key = SWD_KEY(sdp, startslot + i);
				if (key->refcount == 0) {
					result = VM_PAGER_ERROR;
					break;
				}
				swap_decrypt(key, data, dst, block, PAGE_SIZE);
			} else if (bounce) {
#else
			if (bounce) {
#endif
				memcpy(dst, data, PAGE_SIZE);
			}
			data += PAGE_SIZE;
			dst += PAGE_SIZE;
			block += btodb(PAGE_SIZE);
		}
		if (bounce)
			uvm_pagermapout(bouncekva, npages);
	}
	/* kill the pager mapping */
	uvm_pagermapout(kva, npages);

	/* No longer needed; free after encryption/bouncing */
	if (!write && bounce)
		uvm_swap_freepages(tpps, npages);

	/* now dispose of the buf */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);

	if (write && bp->b_vp)
		vwakeup(bp->b_vp);
	pool_put(&bufpool, bp);
	splx(s);

	/* finally return. */
	return (result);
}

void
swapmount(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct vnode *vp;
	dev_t swap_dev = swdevt[0].sw_dev;
	char *nam;
	char path[MNAMELEN + 1];

	/*
	 * No locking here since we happen to know that we will just be called
	 * once before any other process has forked.
	 */
	if (swap_dev == NODEV)
		return;

#if defined(NFSCLIENT)
	if (swap_dev == NETDEV) {
		extern struct nfs_diskless nfs_diskless;

		snprintf(path, sizeof(path), "%s",
		    nfs_diskless.nd_swap.ndm_host);
		vp = nfs_diskless.sw_vp;
		goto gotit;
	} else
#endif
	if (bdevvp(swap_dev, &vp))
		return;

	/* Construct a potential path to swap */
	if ((nam = findblkname(major(swap_dev))))
		snprintf(path, sizeof(path), "/dev/%s%d%c", nam,
		    DISKUNIT(swap_dev), 'a' + DISKPART(swap_dev));
	else
		snprintf(path, sizeof(path), "blkdev0x%x",
		    swap_dev);

#if defined(NFSCLIENT)
gotit:
#endif
	sdp = malloc(sizeof(*sdp), M_VMSWAP, M_WAITOK|M_ZERO);
	spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK);

	sdp->swd_flags = SWF_FAKE;
	sdp->swd_dev = swap_dev;

	sdp->swd_pathlen = strlen(path) + 1;
	sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK | M_ZERO);
	strlcpy(sdp->swd_path, path, sdp->swd_pathlen);

	sdp->swd_vp = vp;

	swaplist_insert(sdp, spp, 0);

	if (swap_on(curproc, sdp)) {
		swaplist_find(vp, 1);
		swaplist_trim();
		vput(sdp->swd_vp);
		free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
		free(sdp, M_VMSWAP, sizeof(*sdp));
		return;
	}
}

#ifdef HIBERNATE
int
uvm_hibswap(dev_t dev, u_long *sp, u_long *ep)
{
	struct swapdev *sdp, *swd = NULL;
	struct swappri *spp;
	struct extent_region *exr, *exrn;
	u_long start = 0, end = 0, size = 0;

	/* no swap devices configured yet? */
	if (uvmexp.nswapdev < 1 || dev != swdevt[0].sw_dev)
		return (1);

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_dev == dev)
				swd = sdp;
		}
	}

	if (swd == NULL || (swd->swd_flags & SWF_ENABLE) == 0)
		return (1);

	LIST_FOREACH(exr, &swd->swd_ex->ex_regions, er_link) {
		u_long gapstart, gapend, gapsize;

		gapstart = exr->er_end + 1;
		exrn = LIST_NEXT(exr, er_link);
		if (!exrn)
			break;
		gapend = exrn->er_start - 1;
		gapsize = gapend - gapstart;
		if (gapsize > size) {
			start = gapstart;
			end = gapend;
			size = gapsize;
		}
	}

	if (size) {
		*sp = start;
		*ep = end;
		return (0);
	}
	return (1);
}
#endif /* HIBERNATE */