/*	$OpenBSD: uvm_swap.c,v 1.34 2001/08/11 10:57:22 art Exp $	*/
/*	$NetBSD: uvm_swap.c,v 1.37 2000/05/19 03:45:04 thorpej Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>

#include <vm/vm.h>
#include <uvm/uvm.h>
#ifdef UVM_SWAP_ENCRYPT
#include <sys/syslog.h>
#endif

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.   there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.   this LIST is headed
 * by the "swap_priority" global var.    each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * the system maintains a fixed pool of "swapbuf" structures for use
 * at swap i/o time.  a swapbuf includes a "buf" structure and an
 * "aiodone" [we want to avoid malloc()'ing anything at swapout time
 * since memory may be low].
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap extent.
 *  - swap_buf_lock (simple_lock): this lock protects the free swapbuf
 *    pool.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
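
/*
 * example (illustrative only, compiled out): a minimal userland sketch
 * of driving the five operations above through swapctl(2).  the device
 * path and priority are made-up values; error handling is reduced to
 * err(3).
 */
#if 0
#include <sys/types.h>
#include <sys/swap.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	struct swapent *sep;
	int i, nswap;

	/* SWAP_ON: start swapping on a device, priority 0 in "misc" */
	if (swapctl(SWAP_ON, "/dev/sd0b", 0) == -1)
		err(1, "SWAP_ON");

	/* SWAP_NSWAP: number of currently configured swap devices */
	if ((nswap = swapctl(SWAP_NSWAP, NULL, 0)) == -1)
		err(1, "SWAP_NSWAP");

	/* SWAP_STATS: load the current swap config into an array */
	if ((sep = calloc(nswap, sizeof(*sep))) == NULL)
		err(1, "calloc");
	if ((nswap = swapctl(SWAP_STATS, sep, nswap)) == -1)
		err(1, "SWAP_STATS");
	for (i = 0; i < nswap; i++)
		printf("%s: %d of %d blocks in use\n",
		    sep[i].se_path, sep[i].se_inuse, sep[i].se_nblks);
	return (0);
}
#endif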

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct swapent	swd_se;
#define	swd_dev		swd_se.se_dev		/* device id */
#define	swd_flags	swd_se.se_flags		/* flags:inuse/enable/fake */
#define	swd_priority	swd_se.se_priority	/* our priority */
#define	swd_inuse	swd_se.se_inuse		/* blocks in use */
#define	swd_nblks	swd_se.se_nblks		/* total blocks */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct buf		swd_tab;	/* buffer list */
	struct ucred		*swd_cred;	/* cred for file access */
#ifdef UVM_SWAP_ENCRYPT
#define	SWD_KEY_SHIFT		7		/* One key per 0.5 MByte */
#define	SWD_KEY(x,y)	&((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT])

#define	SWD_DCRYPT_SHIFT	5
#define	SWD_DCRYPT_BITS		32
#define	SWD_DCRYPT_MASK		(SWD_DCRYPT_BITS - 1)
#define	SWD_DCRYPT_OFF(x)	((x) >> SWD_DCRYPT_SHIFT)
#define	SWD_DCRYPT_BIT(x)	((x) & SWD_DCRYPT_MASK)
#define	SWD_DCRYPT_SIZE(x)	(SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK) * sizeof(u_int32_t))
	u_int32_t		*swd_decrypt;	/* bitmap for decryption */
	struct swap_key		*swd_keys;	/* keys for different parts */
	int			swd_nkeys;	/* active keys */
#endif
};
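
/*
 * example (illustrative): how the macros above address the encryption
 * metadata for a drum slot "slot" on device "sdp".  the slot is first
 * rebased to a page index within the device; one swap_key then covers
 * 1 << SWD_KEY_SHIFT (128) consecutive pages (0.5 MByte with 4KB
 * pages), and the decrypt bitmap keeps one bit per page packed into
 * 32-bit words.
 */
#if 0
	int page = slot - sdp->swd_drumoffset;		/* device-relative */
	struct swap_key *key = SWD_KEY(sdp, slot);	/* key for page/128 */
	u_int32_t word = sdp->swd_decrypt[SWD_DCRYPT_OFF(page)];
	int needs_decrypt = (word >> SWD_DCRYPT_BIT(page)) & 1;
#endif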

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;     /* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
};

/*
 * swapbuf, swapbuffer plus async i/o info
 */
struct swapbuf {
	struct buf sw_buf;		/* a buffer structure */
	struct uvm_aiodesc sw_aio;	/* aiodesc structure, used if ASYNC */
	SIMPLEQ_ENTRY(swapbuf) sw_sq;	/* free list pointer */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};


/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
struct pool vndxfer_pool;
struct pool vndbuf_pool;

#define	getvndxfer(vnx)	do {						\
	int s = splbio();						\
	vnx = pool_get(&vndxfer_pool, PR_MALLOCOK|PR_WAITOK);		\
	splx(s);							\
} while (0)

#define putvndxfer(vnx) {						\
	pool_put(&vndxfer_pool, (void *)(vnx));				\
}

#define	getvndbuf(vbp)	do {						\
	int s = splbio();						\
	vbp = pool_get(&vndbuf_pool, PR_MALLOCOK|PR_WAITOK);		\
	splx(s);							\
} while (0)

#define putvndbuf(vbp) {						\
	pool_put(&vndbuf_pool, (void *)(vbp));				\
}

/* /dev/drum */
bdev_decl(sw);
cdev_decl(sw);

/*
 * local variables
 */
static struct extent *swapmap;		/* controls the mapping of /dev/drum */
SIMPLEQ_HEAD(swapbufhead, swapbuf);
struct pool swapbuf_pool;

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
lock_data_t swap_syscall_lock;

/*
 * prototypes
 */
static void		 swapdrum_add __P((struct swapdev *, int));
static struct swapdev	*swapdrum_getsdp __P((int));

static struct swapdev	*swaplist_find __P((struct vnode *, int));
static void		 swaplist_insert __P((struct swapdev *,
					      struct swappri *, int));
static void		 swaplist_trim __P((void));

static int swap_on __P((struct proc *, struct swapdev *));
static int swap_off __P((struct proc *, struct swapdev *));

static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void sw_reg_iodone __P((struct buf *));
static void sw_reg_start __P((struct swapdev *));

static void uvm_swap_aiodone __P((struct uvm_aiodesc *));
static void uvm_swap_bufdone __P((struct buf *));
static int uvm_swap_io __P((struct vm_page **, int, int, int));

static void swapmount __P((void));

#ifdef UVM_SWAP_ENCRYPT
/* for swap encrypt */
boolean_t uvm_swap_allocpages __P((struct vm_page **, int));
void uvm_swap_freepages __P((struct vm_page **, int));
void uvm_swap_markdecrypt __P((struct swapdev *, int, int, int));
boolean_t uvm_swap_needdecrypt __P((struct swapdev *, int));
void uvm_swap_initcrypt __P((struct swapdev *, int));
#endif

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init()
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
	simple_lock_init(&uvm.swap_data_lock);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.   the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
				M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * allocate our private pool of "swapbuf" structures (includes
	 * a "buf" structure).  ["nswbuf" comes from param.c and can
	 * be adjusted by MD code before we get here].
	 */

	pool_init(&swapbuf_pool, sizeof(struct swapbuf), 0, 0, 0, "swp buf", 0,
	    NULL, NULL, 0);
	/* XXX - set a maximum on swapbuf_pool? */

	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
	    0, NULL, NULL, 0);

	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 0,
	    NULL, NULL, 0);

	/*
	 * Setup the initial swap partition
	 */
	swapmount();

	/*
	 * done!
	 */
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

#ifdef UVM_SWAP_ENCRYPT
void
uvm_swap_initcrypt_all(void)
{
	struct swapdev *sdp;
	struct swappri *spp;

	simple_lock(&uvm.swap_data_lock);

	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next)
			if (sdp->swd_decrypt == NULL)
				uvm_swap_initcrypt(sdp, sdp->swd_npages);
	}
	simple_unlock(&uvm.swap_data_lock);
}

void
uvm_swap_initcrypt(struct swapdev *sdp, int npages)
{
	/*
	 * keep information if a page needs to be decrypted when we get it
	 * from the swap device.
	 * We cannot chance a malloc later: if we are doing ASYNC puts,
	 * we may not call malloc with M_WAITOK.  This consumes only
	 * 8KB memory for a 256MB swap partition.
	 */
	sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages), M_VMSWAP, M_WAITOK);
	memset(sdp->swd_decrypt, 0, SWD_DCRYPT_SIZE(npages));
	sdp->swd_keys = malloc((npages >> SWD_KEY_SHIFT) * sizeof(struct swap_key),
	    M_VMSWAP, M_WAITOK);
	memset(sdp->swd_keys, 0, (npages >> SWD_KEY_SHIFT) * sizeof(struct swap_key));
	sdp->swd_nkeys = 0;
}

boolean_t
uvm_swap_allocpages(struct vm_page **pps, int npages)
{
	int i, s;
	boolean_t fail;

	/* Estimate if we will succeed */
	s = uvm_lock_fpageq();

	fail = uvmexp.free - npages < uvmexp.reserve_kernel;

	uvm_unlock_fpageq(s);

	if (fail)
		return FALSE;

	/* Get new pages */
	for (i = 0; i < npages; i++) {
		pps[i] = uvm_pagealloc(NULL, 0, NULL, 0);
		if (pps[i] == NULL)
			break;
	}

	/* On failure free and return */
	if (i < npages) {
		uvm_swap_freepages(pps, i);
		return FALSE;
	}

	return TRUE;
}

void
uvm_swap_freepages(struct vm_page **pps, int npages)
{
	int i;

	uvm_lock_pageq();
	for (i = 0; i < npages; i++)
		uvm_pagefree(pps[i]);
	uvm_unlock_pageq();
}

/*
 * Mark pages on the swap device for later decryption
 */

void
uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages,
    int decrypt)
{
	int pagestart, i;
	int off, bit;

	if (!sdp)
		return;

	pagestart = startslot - sdp->swd_drumoffset;
	for (i = 0; i < npages; i++, pagestart++) {
		off = SWD_DCRYPT_OFF(pagestart);
		bit = SWD_DCRYPT_BIT(pagestart);
		if (decrypt)
			/* pages read need decryption */
			sdp->swd_decrypt[off] |= 1 << bit;
		else
			/* pages read do not need decryption */
			sdp->swd_decrypt[off] &= ~(1 << bit);
	}
}

/*
 * Check if the page that we got from disk needs to be decrypted
 */

boolean_t
uvm_swap_needdecrypt(struct swapdev *sdp, int off)
{
	if (!sdp)
		return FALSE;

	off -= sdp->swd_drumoffset;
	return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)] & (1 << SWD_DCRYPT_BIT(off)) ?
		TRUE : FALSE;
}
#endif /* UVM_SWAP_ENCRYPT */

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
static void
swaplist_insert(sdp, newspp, priority)
	struct swapdev *sdp;
	struct swappri *newspp;
	int priority;
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
			    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).   now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(vp, remove)
	struct vnode *vp;
	boolean_t remove;
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 */
static void
swaplist_trim()
{
	struct swappri *spp, *nextspp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
		nextspp = LIST_NEXT(spp, spi_swappri);
		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
		    (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP);
	}
}

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => uvm.swap_data_lock should be unlocked (we may sleep)
 */
static void
swapdrum_add(sdp, npages)
	struct swapdev *sdp;
	int	npages;
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}
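
/*
 * example (illustrative): with two swap devices configured the drum
 * might be carved up as follows.  each swapdev owns one contiguous
 * range [swd_drumoffset, swd_drumoffset + swd_drumsize), and
 * swapdrum_getsdp() below is a linear search over those ranges.
 *
 *	/dev/drum page:   [0]  [1 .. 1024]      [1025 .. 3072]
 *	                   |    first swapdev    second swapdev
 *	                   `--- reserved (0 means "no allocation")
 */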

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm.swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(pgno)
	int pgno;
{
	struct swapdev *sdp;
	struct swappri *spp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri))
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
	return NULL;
}


/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[MAXPATHLEN];
	size_t	len;
	int	count, error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, NULL, p);

	/*
	 * we handle the non-priv NSWAP and STATS request first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm.swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_OSTATS
#endif
	    ) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		for (spp = LIST_FIRST(&swap_priority); spp != NULL;
		     spp = LIST_NEXT(spp, spi_swappri)) {
			for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
			     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
				sdp->swd_inuse =
				    btodb(sdp->swd_npginuse << PAGE_SHIFT);
				error = copyout(&sdp->swd_se, sep,
				    sizeof(struct swapent));

				/* now copy out the path if necessary */
#if defined(COMPAT_13)
				if (error == 0 && SCARG(uap, cmd) == SWAP_STATS)
#else
				if (error == 0)
#endif
					error = copyout(sdp->swd_path,
					    &sep->se_path, sdp->swd_pathlen);

				if (error)
					goto out;
				count++;
#if defined(COMPAT_13)
				if (SCARG(uap, cmd) == SWAP_OSTATS)
					((struct oswapent *)sep)++;
				else
#endif
					sep++;
			}
		}

		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);

		*retval = count;
		error = 0;
		goto out;
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = suser(p->p_ucred, &p->p_acflag)))
		goto out;

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE, p)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, sizeof userpath, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    sizeof userpath, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {
	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			/* break (not "goto out") so vp is vput'd below */
			error = ENOTBLK;
			break;
		}
		dumpdev = vp->v_rdev;

		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		simple_unlock(&uvm.swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			simple_unlock(&uvm.swap_data_lock);
			break;
		}
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		memset(sdp, 0, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;

		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG) {
			sdp->swd_cred = crdup(p->p_ucred);
		}

		swaplist_insert(sdp, spp, priority);
		simple_unlock(&uvm.swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(p, sdp)) != 0) {
			simple_lock(&uvm.swap_data_lock);
			(void) swaplist_find(vp, 1);	/* kill fake entry */
			swaplist_trim();
			simple_unlock(&uvm.swap_data_lock);
			if (vp->v_type == VREG) {
				crfree(sdp->swd_cred);
			}
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}

		/*
		 * got it!   now add a second reference to vp so that
		 * we keep a reference to the vnode after we return.
		 */
		vref(vp);
		break;

	case SWAP_OFF:
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			simple_unlock(&uvm.swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			simple_unlock(&uvm.swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		if ((error = swap_off(p, sdp)) != 0)
			goto out;

		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  use vput to drop our reference and unlock
	 */
	vput(vp);
out:
	lockmgr(&swap_syscall_lock, LK_RELEASE, NULL, p);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.   note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm.swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	struct vattr va;
#if defined(NFSCLIENT)
	extern int (**nfsv2_vnodeop_p) __P((void *));
#endif /* defined(NFSCLIENT) */
	dev_t dev;
	char *name;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);

	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
#if defined(NFSCLIENT)
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* defined(NFSCLIENT) */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	name = malloc(12, M_VMSWAP, M_WAITOK);
	sprintf(name, "swap0x%04x", count++);

	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(name, 0, npages - 1, M_VMSWAP,
				    0, 0, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel region");
	}

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (extent_alloc_region(sdp->swd_ex, addr,
		    rootpages, EX_WAITOK))
			panic("swap_on: unable to preserve miniroot");

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add anons to reflect the new swap space
	 */
	uvm_anon_add(size);

#ifdef UVM_SWAP_ENCRYPT
	if (uvm_doswapencrypt)
		uvm_swap_initcrypt(sdp, npages);
#endif
	/*
	 * now add the new swapdev to the drum and enable.
	 */
	simple_lock(&uvm.swap_data_lock);
	swapdrum_add(sdp, npages);
	sdp->swd_npages = size;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	simple_unlock(&uvm.swap_data_lock);
	uvmexp.swpages += size;

	return (0);

bad:
	/*
	 * failure: close device if necessary and return error.
	 */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	void *name;
	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev,0,0,0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	simple_unlock(&uvm.swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    anon_swap_off(sdp->swd_drumoffset,
			  sdp->swd_drumoffset + sdp->swd_drumsize)) {

		simple_lock(&uvm.swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		simple_unlock(&uvm.swap_data_lock);
		return ENOMEM;
	}

#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse != sdp->swd_npgbad) {
		panic("swap_off: sdp %p - %d pages still in use (%d bad)",
		    sdp, sdp->swd_npginuse, sdp->swd_npgbad);
	}
#endif

	/*
	 * done with the vnode.
	 */
	if (sdp->swd_vp->v_type == VREG) {
		crfree(sdp->swd_cred);
	}
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	}
	if (sdp->swd_vp) {
		vrele(sdp->swd_vp);
	}

	/* remove anons from the system */
	uvm_anon_remove(sdp->swd_npages);

	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpages -= sdp->swd_npages;

	if (swaplist_find(sdp->swd_vp, 1) == NULL)
		panic("swap_off: swapdev not in list");
	swaplist_trim();

	/*
	 * free all resources!
	 */
	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
		    EX_WAITOK);
	name = (void *)sdp->swd_ex->ex_name;
	extent_destroy(sdp->swd_ex);
	free(name, M_VMSWAP);
	free(sdp, M_VMSWAP);
	simple_unlock(&uvm.swap_data_lock);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swread(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swwrite(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(bp)
	struct buf *bp;
{
	struct swapdev *sdp;
	int s, pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob(bp->b_blkno) >> PAGE_SHIFT;
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	simple_unlock(&uvm.swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;		/* page # on swapdev */
	bn = btodb(pageno << PAGE_SHIFT);	/* convert to diskblock */

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld\n",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		buf_replacevnode(bp, sdp->swd_vp);

		bp->b_blkno = bn;
		splx(s);
		VOP_STRATEGY(bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}
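
/*
 * worked example (illustrative): with PAGE_SIZE 4096 and DEV_BSIZE 512,
 * a request at drum block b_blkno = 8192 is drum page
 * dbtob(8192) >> PAGE_SHIFT = 1024.  if that page falls in a swapdev
 * with swd_drumoffset = 1000, it is page 24 of that device, i.e.
 * device block btodb(24 << PAGE_SHIFT) = 192.
 */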

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(sdp, bp, bn)
	struct swapdev	*sdp;
	struct buf	*bp;
	int		bn;
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn, byteoff;
	caddr_t		addr;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	getvndxfer(vnx);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob(bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
				 &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 * XXXCDC: ignores read-ahead for non-zero offset
		 */
		if ((off = (byteoff % sdp->swd_bsize)) != 0)
			sz = sdp->swd_bsize - off;
		else
			sz = (1 + nra) * sdp->swd_bsize;

		if (resid < sz)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p offset 0x%x/0x%x",
		    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		getvndbuf(nbp);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_proc     = bp->b_proc;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = NULLVP;
		nbp->vb_buf.b_vnbufs.le_next = NOLIST;
		nbp->vb_buf.b_rcred    = sdp->swd_cred;
		nbp->vb_buf.b_wcred    = sdp->swd_cred;
		LIST_INIT(&nbp->vb_buf.b_dep);

		/*
		 * set b_dirtyoff/end and b_validoff/end.   this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* sort it in and start I/O if we are not over our limit */
		disksort(&sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by disksort (above)
 */
static void
sw_reg_start(sdp)
	struct swapdev	*sdp;
{
	struct buf	*bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_tab.b_active < sdp->swd_maxactive) {
		bp = sdp->swd_tab.b_actf;
		if (bp == NULL)
			break;
		sdp->swd_tab.b_actf = bp->b_actf;
		sdp->swd_tab.b_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;
		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int		s, resid;
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error) {
		UVMHIST_LOG(pdhist, "  got error=%d !",
		    vbp->vb_buf.b_error, 0, 0, 0);

		/* pass error upward */
		vnx->vx_error = vbp->vb_buf.b_error;
	}

	/*
	 * disassociate this buffer from the vnode (if any).
	 */
	if (vbp->vb_buf.b_vp != NULLVP) {
		brelvp(&vbp->vb_buf);
	}

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
#ifdef DIAGNOSTIC
		if (vnx->vx_pending != 0)
			panic("sw_reg_iodone: vnx pending: %d",vnx->vx_pending);
#endif

		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_tab.b_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm.swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(nslots, lessok)
	int *nslots;	/* IN/OUT */
	boolean_t lessok;
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long	result;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?   definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	simple_lock(&uvm.swap_data_lock);

ReTry:	/* XXXMRG */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp,swd_next)) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN, 0,
					 EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
					 &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			simple_unlock(&uvm.swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return(result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	simple_unlock(&uvm.swap_data_lock);
	return 0;		/* failed */
}

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_markbad(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */
	sdp->swd_npgbad += nslots;

	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_free(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */
	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.   must hold pri lock to do
	 * lookup and access the extent.
	 */
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

#ifdef DIAGNOSTIC
	if (uvmexp.nswapdev < 1)
		panic("uvm_swap_free: uvmexp.nswapdev < 1");
	if (sdp == NULL) {
		printf("uvm_swap_free: startslot %d, nslots %d\n", startslot,
		    nslots);
		panic("uvm_swap_free: unmapped address");
	}
#endif
	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
			EX_MALLOCOK|EX_NOWAIT) != 0) {
		printf("warning: resource shortage: %d pages of swap lost\n",
			nslots);
	}

	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse < 0)
		panic("uvm_swap_free: inuse < 0");
#endif
#ifdef UVM_SWAP_ENCRYPT
	{
		int i;
		if (swap_encrypt_initalized) {
			/* Dereference keys */
			for (i = 0; i < nslots; i++)
				if (uvm_swap_needdecrypt(sdp, startslot + i))
					SWAP_KEY_PUT(sdp, SWD_KEY(sdp, startslot + i));

			/* Mark range as not decrypt */
			uvm_swap_markdecrypt(sdp, startslot, nslots, 0);
		}
	}
#endif /* UVM_SWAP_ENCRYPT */
	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_put(swslot, ppsp, npages, flags)
	int swslot;
	struct vm_page **ppsp;
	int	npages;
	int	flags;
{
	int	result;

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_get(page, swslot, flags)
	struct vm_page *page;
	int swslot, flags;
{
	int	result;

	uvmexp.nswget++;
#ifdef DIAGNOSTIC
	if ((flags & PGO_SYNCIO) == 0)
		printf("uvm_swap_get: ASYNC get requested?\n");
#endif

	if (swslot == SWSLOT_BAD) {
		return VM_PAGER_ERROR;
	}

	/*
	 * this page is (about to be) no longer only in swap.
	 */
	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpgonly--;
	simple_unlock(&uvm.swap_data_lock);

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	if (result != VM_PAGER_OK && result != VM_PAGER_PEND) {
		/*
		 * oops, the read failed so it really is still only in swap.
		 */
		simple_lock(&uvm.swap_data_lock);
		uvmexp.swpgonly++;
		simple_unlock(&uvm.swap_data_lock);
	}

	return (result);
}
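
/*
 * example (illustrative only, compiled out): how a swap-backed layer
 * might drive the slot life cycle with uvm_swap_alloc(), the put/get
 * functions above, uvm_swap_markbad() and uvm_swap_free().  "pps" and
 * "npages" stand in for a caller's page cluster; error handling is
 * reduced to the essentials.
 */
#if 0
	int nslots = npages;
	int slot;

	slot = uvm_swap_alloc(&nslots, TRUE);	/* TRUE: fewer slots is ok */
	if (slot == 0)
		return (VM_PAGER_AGAIN);	/* no space on the drum */

	if (uvm_swap_put(slot, pps, nslots, PGO_SYNCIO) != VM_PAGER_OK) {
		uvm_swap_markbad(slot, nslots);	/* remember the i/o error */
		return (VM_PAGER_ERROR);
	}

	/* later, at fault time: read one page back and retire the slots */
	if (uvm_swap_get(pps[0], slot, PGO_SYNCIO) == VM_PAGER_OK)
		uvm_swap_free(slot, nslots);
#endif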

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(pps, startslot, npages, flags)
	struct vm_page **pps;
	int startslot, npages, flags;
{
	daddr_t startblk;
	struct	swapbuf *sbp;
	struct	buf *bp;
	vaddr_t kva;
	int	result, s, mapinflags, pflag;
#ifdef UVM_SWAP_ENCRYPT
	vaddr_t dstkva;
	struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
	struct swapdev *sdp;
	int	encrypt = 0;
#endif
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);

	/*
	 * convert starting drum slot to block number
	 */
	startblk = btodb(startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).   note that we don't let pagermapin alloc
	 * an aiodesc structure because we don't want to chance a malloc.
	 * we've got our own pool of aiodesc structures (in swapbuf).
	 */
	mapinflags = (flags & B_READ) ? UVMPAGER_MAPIN_READ :
	    UVMPAGER_MAPIN_WRITE;
	if ((flags & B_ASYNC) == 0)
		mapinflags |= UVMPAGER_MAPIN_WAITOK;
	kva = uvm_pagermapin(pps, npages, NULL, mapinflags);
	if (kva == 0)
		return (VM_PAGER_AGAIN);

#ifdef UVM_SWAP_ENCRYPT
	if ((flags & B_READ) == 0) {
		/*
		 * Check if we need to do swap encryption on old pages.
		 * Later we need a different scheme, that swap encrypts
		 * all pages of a process that had at least one page swap
		 * encrypted.  Then we might not need to copy all pages
		 * in the cluster, and avoid the memory overhead in
		 * swapping.
		 */
		if (uvm_doswapencrypt)
			encrypt = 1;
	}

	if (swap_encrypt_initalized || encrypt) {
		/*
		 * we need to know the swap device that we are swapping to/from
		 * to see if the pages need to be marked for decryption or
		 * actually need to be decrypted.
		 * XXX - does this information stay the same over the whole
		 * execution of this function?
		 */
		simple_lock(&uvm.swap_data_lock);
		sdp = swapdrum_getsdp(startslot);
		simple_unlock(&uvm.swap_data_lock);
	}

	/*
	 * encrypt to swap
	 */
	if ((flags & B_READ) == 0 && encrypt) {
		int i, opages;
		caddr_t src, dst;
		struct swap_key *key;
		u_int64_t block;
		int swmapflags;

		/* We always need write access. */
		swmapflags = UVMPAGER_MAPIN_READ;
		if ((flags & B_ASYNC) == 0)
			swmapflags |= UVMPAGER_MAPIN_WAITOK;

		if (!uvm_swap_allocpages(tpps, npages)) {
			uvm_pagermapout(kva, npages);
			return (VM_PAGER_AGAIN);
		}

		dstkva = uvm_pagermapin(tpps, npages, NULL, swmapflags);
		if (dstkva == 0) {
			uvm_pagermapout(kva, npages);
			uvm_swap_freepages(tpps, npages);
			return (VM_PAGER_AGAIN);
		}

		src = (caddr_t) kva;
		dst = (caddr_t) dstkva;
		block = startblk;
		for (i = 0; i < npages; i++) {
			key = SWD_KEY(sdp, startslot + i);
			SWAP_KEY_GET(sdp, key);	/* add reference */

			/* mark for async writes */
			tpps[i]->pqflags |= PQ_ENCRYPT;
			swap_encrypt(key, src, dst, block, 1 << PAGE_SHIFT);
			src += 1 << PAGE_SHIFT;
			dst += 1 << PAGE_SHIFT;
			block += btodb(1 << PAGE_SHIFT);
		}

		uvm_pagermapout(kva, npages);

		/* dispose of pages we don't use anymore */
		opages = npages;
		uvm_pager_dropcluster(NULL, NULL, pps, &opages,
				      PGO_PDFREECLUST);

		kva = dstkva;
	}
#endif /* UVM_SWAP_ENCRYPT */

	/*
	 * now allocate a swap buffer off of freesbufs
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	pflag = ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc)
	    ? 0
	    : PR_WAITOK;
	sbp = pool_get(&swapbuf_pool, pflag);
	splx(s);		/* drop splbio */

	/*
	 * if we failed to get a swapbuf, return "try again"
	 */
	if (sbp == NULL) {
#ifdef UVM_SWAP_ENCRYPT
		if ((flags & B_READ) == 0 && encrypt) {
			int i;

			/* swap encrypt needs cleanup */
			for (i = 0; i < npages; i++)
				SWAP_KEY_PUT(sdp, SWD_KEY(sdp, startslot + i));

			uvm_pagermapout(kva, npages);
			uvm_swap_freepages(tpps, npages);
		}
#endif
		return (VM_PAGER_AGAIN);
	}
	/*
	 * now allocate a swap buffer from the swapbuf pool
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	pflag = ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc)
	    ? 0
	    : PR_WAITOK;
	sbp = pool_get(&swapbuf_pool, pflag);
	splx(s);		/* drop splbio */

	/*
	 * if we failed to get a swapbuf, return "try again"
	 */
	if (sbp == NULL) {
#ifdef UVM_SWAP_ENCRYPT
		if ((flags & B_READ) == 0 && encrypt) {
			int i;

			/* swap encrypt needs cleanup */
			for (i = 0; i < npages; i++)
				SWAP_KEY_PUT(sdp, SWD_KEY(sdp, startslot + i));

			uvm_pagermapout(kva, npages);
			uvm_swap_freepages(tpps, npages);
		}
#endif
		return (VM_PAGER_AGAIN);
	}

#ifdef UVM_SWAP_ENCRYPT
	/*
	 * prevent ASYNC reads.  for reads, uvm_swap_io is only called
	 * from uvm_swap_get, which assumes that all gets are SYNCIO.
	 * just make sure of that here.
	 */
	if (flags & B_READ)
		flags &= ~B_ASYNC;
#endif
	/*
	 * fill in the bp/sbp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp = &sbp->sw_buf;
	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_rcred = bp->b_wcred = proc0.p_ucred;
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = startblk;
	LIST_INIT(&bp->b_dep);
	s = splbio();
	bp->b_vp = NULL;
	buf_replacevnode(bp, swapdev_vp);
	splx(s);
	bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * for pageouts we must set "dirtyoff" [the NFS client code needs it],
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if ((bp->b_flags & B_READ) == 0) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages << PAGE_SHIFT;
#ifdef UVM_SWAP_ENCRYPT
		/* mark the pages in the drum for decryption */
		if (swap_encrypt_initalized)
			uvm_swap_markdecrypt(sdp, startslot, npages, encrypt);
#endif
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/*
	 * for async ops we must set up the aiodesc and the callback.
	 * XXX: we expect no async reads, but we don't prevent them here.
	 */
	if (flags & B_ASYNC) {
		sbp->sw_aio.aiodone = uvm_swap_aiodone;
		sbp->sw_aio.kva = kva;
		sbp->sw_aio.npages = npages;
		sbp->sw_aio.pd_ptr = sbp;	/* backpointer */
		bp->b_flags |= B_CALL;		/* set callback */
		bp->b_iodone = uvm_swap_bufdone; /* "buf" iodone function */
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the i/o, and if async, return.
	 */
	VOP_STRATEGY(bp);
	if (flags & B_ASYNC)
		return (VM_PAGER_PEND);

	/*
	 * must be sync i/o.  wait for it to finish.
	 */
	bp->b_error = biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

#ifdef UVM_SWAP_ENCRYPT
	/*
	 * decrypt swap (in place) after a successful sync read
	 */
	if (swap_encrypt_initalized &&
	    (bp->b_flags & B_READ) && !(bp->b_flags & B_ERROR)) {
		int i;
		caddr_t data = bp->b_data;
		u_int64_t block = startblk;
		struct swap_key *key = NULL;

		for (i = 0; i < npages; i++) {
			/* check if we need to decrypt */
			if (uvm_swap_needdecrypt(sdp, startslot + i)) {
				key = SWD_KEY(sdp, startslot + i);
				swap_decrypt(key, data, data, block,
				    1 << PAGE_SHIFT);
			}
			data += 1 << PAGE_SHIFT;
			block += btodb(1 << PAGE_SHIFT);
		}
	}
#endif
	/*
	 * kill the pager mapping
	 */
	uvm_pagermapout(kva, npages);

#ifdef UVM_SWAP_ENCRYPT
	/*
	 * the bounce pages are no longer needed; free them after encryption
	 */
	if ((bp->b_flags & B_READ) == 0 && encrypt)
		uvm_swap_freepages(tpps, npages);
#endif
	/*
	 * now dispose of the swap buffer
	 */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);

	pool_put(&swapbuf_pool, sbp);
	splx(s);

	/*
	 * finally return.
	 */
	UVMHIST_LOG(pdhist, "<- done (sync) result=%d", result, 0, 0, 0);
	return (result);
}
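/*
 * async completion path (for reference): once the device finishes the
 * i/o, biodone() sees B_CALL and invokes uvm_swap_bufdone() below, which
 * queues the embedded aiodesc on uvm.aio_done and wakes the pagedaemon;
 * the pagedaemon then calls uvm_swap_aiodone() to finish the job in its
 * own context.
 */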
/*
 * uvm_swap_bufdone: called from the buffer system when the i/o is done
 */
static void
uvm_swap_bufdone(bp)
	struct buf *bp;
{
	struct swapbuf *sbp = (struct swapbuf *) bp;
	int s = splbio();
	UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "cleaning buf %p", bp, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check: swapbufs are private, so they shouldn't be wanted
	 */
	if (bp->b_flags & B_WANTED)
		panic("uvm_swap_bufdone: private buf wanted");
#endif

	/*
	 * drop the buffer's reference to the vnode.
	 */
	if (bp->b_vp)
		brelvp(bp);

	/*
	 * now put the aio on the uvm.aio_done list and wake the
	 * pagedaemon (which will finish up our job in its context).
	 */
	simple_lock(&uvm.pagedaemon_lock);	/* locks uvm.aio_done */
	TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq);
	simple_unlock(&uvm.pagedaemon_lock);

	wakeup(&uvm.pagedaemon);
	splx(s);
}

/*
 * uvm_swap_aiodone: aiodone function for anonymous memory
 *
 * => this is called in the context of the pagedaemon (but with the
 *	page queues unlocked!)
 * => our "aio" structure must be part of a "swapbuf"
 */
static void
uvm_swap_aiodone(aio)
	struct uvm_aiodesc *aio;
{
	struct swapbuf *sbp = aio->pd_ptr;
	struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT];
	int lcv, s;
	vaddr_t addr;
	UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check
	 */
	if (aio->npages > (MAXBSIZE >> PAGE_SHIFT))
		panic("uvm_swap_aiodone: aio too big!");
#endif

	/*
	 * first, we have to recover the page pointers (pps) by poking in the
	 * kernel pmap (XXX: should be saved in the buf structure).
	 */
	for (addr = aio->kva, lcv = 0; lcv < aio->npages;
	    addr += PAGE_SIZE, lcv++) {
		pps[lcv] = uvm_pageratop(addr);
	}

	/*
	 * now we can dispose of the kernel mappings of the buffer
	 */
	uvm_pagermapout(aio->kva, aio->npages);

	/*
	 * now we can dispose of the pages by using the dropcluster function
	 * [note that we have no "page of interest" so we pass in null]
	 */
#ifdef UVM_SWAP_ENCRYPT
	/*
	 * XXX - assumes that we only get ASYNC writes; used to be above.
	 * bounce pages used for swap encryption are simply freed here.
	 */
	if (pps[0]->pqflags & PQ_ENCRYPT)
		uvm_swap_freepages(pps, aio->npages);
	else
#endif /* UVM_SWAP_ENCRYPT */
	uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages,
	    PGO_PDFREECLUST);

	/*
	 * finally, we can dispose of the swapbuf
	 */
	s = splbio();
	pool_put(&swapbuf_pool, sbp);
	splx(s);

	/*
	 * done!
	 */
}

/*
 * swapmount: configure the primary swap device (from swdevt[0]) at
 * boot time.
 */
static void
swapmount()
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct vnode *vp;
	dev_t swap_dev = swdevt[0].sw_dev;

	/*
	 * no locking here since we happen to know that we will just be
	 * called once before any other process has forked.
	 */
	if (swap_dev == NODEV) {
		printf("swapmount: no device\n");
		return;
	}

	if (bdevvp(swap_dev, &vp)) {
		printf("swapmount: bdevvp failed\n");
		return;
	}

	sdp = malloc(sizeof(*sdp), M_VMSWAP, M_WAITOK);
	spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK);
	memset(sdp, 0, sizeof(*sdp));

	sdp->swd_flags = SWF_FAKE;
	sdp->swd_dev = swap_dev;
	sdp->swd_vp = vp;
	swaplist_insert(sdp, spp, 0);
	sdp->swd_pathlen = strlen("swap_device") + 1;
	sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
	if (copystr("swap_device", sdp->swd_path, sdp->swd_pathlen, 0))
		panic("swapmount: copystr");

	/*
	 * if we cannot enable swapping on the device, back out: remove
	 * it from the swap list again and release everything we set up.
	 */
	if (swap_on(curproc, sdp)) {
		swaplist_find(vp, 1);
		swaplist_trim();
		vput(sdp->swd_vp);
		free(sdp->swd_path, M_VMSWAP);
		free(sdp, M_VMSWAP);
		return;
	}

	VOP_UNLOCK(vp, 0, curproc);
}
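/*
 * illustrative sketch (userland side, not part of this file; field
 * names assume the swapent layout used by this kernel): once
 * swapmount() has enabled the device, it is visible through the
 * swapctl(2) interface, e.g.
 *
 *	struct swapent se;
 *
 *	if (swapctl(SWAP_STATS, &se, 1) == 1)
 *		... se.se_dev and se.se_nblks describe the boot swap
 *		    device configured above ...
 */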