1 /* $NetBSD: uvm_swap.c,v 1.177 2018/03/15 03:21:58 christos Exp $ */ 2 3 /* 4 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp 29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.177 2018/03/15 03:21:58 christos Exp $"); 34 35 #include "opt_uvmhist.h" 36 #include "opt_compat_netbsd.h" 37 #include "opt_ddb.h" 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/buf.h> 42 #include <sys/bufq.h> 43 #include <sys/conf.h> 44 #include <sys/proc.h> 45 #include <sys/namei.h> 46 #include <sys/disklabel.h> 47 #include <sys/errno.h> 48 #include <sys/kernel.h> 49 #include <sys/vnode.h> 50 #include <sys/file.h> 51 #include <sys/vmem.h> 52 #include <sys/blist.h> 53 #include <sys/mount.h> 54 #include <sys/pool.h> 55 #include <sys/kmem.h> 56 #include <sys/syscallargs.h> 57 #include <sys/swap.h> 58 #include <sys/kauth.h> 59 #include <sys/sysctl.h> 60 #include <sys/workqueue.h> 61 62 #include <uvm/uvm.h> 63 64 #include <miscfs/specfs/specdev.h> 65 66 /* 67 * uvm_swap.c: manage configuration and i/o to swap space. 68 */ 69 70 /* 71 * swap space is managed in the following way: 72 * 73 * each swap partition or file is described by a "swapdev" structure. 74 * each "swapdev" structure contains a "swapent" structure which contains 75 * information that is passed up to the user (via system calls). 76 * 77 * each swap partition is assigned a "priority" (int) which controls 78 * swap parition usage. 79 * 80 * the system maintains a global data structure describing all swap 81 * partitions/files. there is a sorted LIST of "swappri" structures 82 * which describe "swapdev"'s at that priority. this LIST is headed 83 * by the "swap_priority" global var. each "swappri" contains a 84 * TAILQ of "swapdev" structures at that priority. 85 * 86 * locking: 87 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl 88 * system call and prevents the swap priority list from changing 89 * while we are in the middle of a system call (e.g. SWAP_STATS). 90 * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data 91 * structures including the priority list, the swapdev structures, 92 * and the swapmap arena. 93 * 94 * each swap device has the following info: 95 * - swap device in use (could be disabled, preventing future use) 96 * - swap enabled (allows new allocations on swap) 97 * - map info in /dev/drum 98 * - vnode pointer 99 * for swap files only: 100 * - block size 101 * - max byte count in buffer 102 * - buffer 103 * 104 * userland controls and configures swap with the swapctl(2) system call. 105 * the sys_swapctl performs the following operations: 106 * [1] SWAP_NSWAP: returns the number of swap devices currently configured 107 * [2] SWAP_STATS: given a pointer to an array of swapent structures 108 * (passed in via "arg") of a size passed in via "misc" ... we load 109 * the current swap config into the array. The actual work is done 110 * in the uvm_swap_stats() function. 111 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a 112 * priority in "misc", start swapping on it. 113 * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device 114 * [5] SWAP_CTL: changes the priority of a swap device (new priority in 115 * "misc") 116 */ 117 118 /* 119 * swapdev: describes a single swap partition/file 120 * 121 * note the following should be true: 122 * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks] 123 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel] 124 */ 125 struct swapdev { 126 dev_t swd_dev; /* device id */ 127 int swd_flags; /* flags:inuse/enable/fake */ 128 int swd_priority; /* our priority */ 129 int swd_nblks; /* blocks in this device */ 130 char *swd_path; /* saved pathname of device */ 131 int swd_pathlen; /* length of pathname */ 132 int swd_npages; /* #pages we can use */ 133 int swd_npginuse; /* #pages in use */ 134 int swd_npgbad; /* #pages bad */ 135 int swd_drumoffset; /* page0 offset in drum */ 136 int swd_drumsize; /* #pages in drum */ 137 blist_t swd_blist; /* blist for this swapdev */ 138 struct vnode *swd_vp; /* backing vnode */ 139 TAILQ_ENTRY(swapdev) swd_next; /* priority tailq */ 140 141 int swd_bsize; /* blocksize (bytes) */ 142 int swd_maxactive; /* max active i/o reqs */ 143 struct bufq_state *swd_tab; /* buffer list */ 144 int swd_active; /* number of active buffers */ 145 }; 146 147 /* 148 * swap device priority entry; the list is kept sorted on `spi_priority'. 149 */ 150 struct swappri { 151 int spi_priority; /* priority */ 152 TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev; 153 /* tailq of swapdevs at this priority */ 154 LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */ 155 }; 156 157 /* 158 * The following two structures are used to keep track of data transfers 159 * on swap devices associated with regular files. 160 * NOTE: this code is more or less a copy of vnd.c; we use the same 161 * structure names here to ease porting.. 162 */ 163 struct vndxfer { 164 struct buf *vx_bp; /* Pointer to parent buffer */ 165 struct swapdev *vx_sdp; 166 int vx_error; 167 int vx_pending; /* # of pending aux buffers */ 168 int vx_flags; 169 #define VX_BUSY 1 170 #define VX_DEAD 2 171 }; 172 173 struct vndbuf { 174 struct buf vb_buf; 175 struct vndxfer *vb_xfer; 176 }; 177 178 /* 179 * We keep a of pool vndbuf's and vndxfer structures. 180 */ 181 static struct pool vndxfer_pool, vndbuf_pool; 182 183 /* 184 * local variables 185 */ 186 static vmem_t *swapmap; /* controls the mapping of /dev/drum */ 187 188 /* list of all active swap devices [by priority] */ 189 LIST_HEAD(swap_priority, swappri); 190 static struct swap_priority swap_priority; 191 192 /* locks */ 193 static krwlock_t swap_syscall_lock; 194 195 /* workqueue and use counter for swap to regular files */ 196 static int sw_reg_count = 0; 197 static struct workqueue *sw_reg_workqueue; 198 199 /* tuneables */ 200 u_int uvm_swapisfull_factor = 99; 201 202 /* 203 * prototypes 204 */ 205 static struct swapdev *swapdrum_getsdp(int); 206 207 static struct swapdev *swaplist_find(struct vnode *, bool); 208 static void swaplist_insert(struct swapdev *, 209 struct swappri *, int); 210 static void swaplist_trim(void); 211 212 static int swap_on(struct lwp *, struct swapdev *); 213 static int swap_off(struct lwp *, struct swapdev *); 214 215 static void sw_reg_strategy(struct swapdev *, struct buf *, int); 216 static void sw_reg_biodone(struct buf *); 217 static void sw_reg_iodone(struct work *wk, void *dummy); 218 static void sw_reg_start(struct swapdev *); 219 220 static int uvm_swap_io(struct vm_page **, int, int, int); 221 222 /* 223 * uvm_swap_init: init the swap system data structures and locks 224 * 225 * => called at boot time from init_main.c after the filesystems 226 * are brought up (which happens after uvm_init()) 227 */ 228 void 229 uvm_swap_init(void) 230 { 231 UVMHIST_FUNC("uvm_swap_init"); 232 233 UVMHIST_CALLED(pdhist); 234 /* 235 * first, init the swap list, its counter, and its lock. 236 * then get a handle on the vnode for /dev/drum by using 237 * the its dev_t number ("swapdev", from MD conf.c). 238 */ 239 240 LIST_INIT(&swap_priority); 241 uvmexp.nswapdev = 0; 242 rw_init(&swap_syscall_lock); 243 mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE); 244 245 if (bdevvp(swapdev, &swapdev_vp)) 246 panic("%s: can't get vnode for swap device", __func__); 247 if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY)) 248 panic("%s: can't lock swap device", __func__); 249 if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED)) 250 panic("%s: can't open swap device", __func__); 251 VOP_UNLOCK(swapdev_vp); 252 253 /* 254 * create swap block resource map to map /dev/drum. the range 255 * from 1 to INT_MAX allows 2 gigablocks of swap space. note 256 * that block 0 is reserved (used to indicate an allocation 257 * failure, or no allocation). 258 */ 259 swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0, 260 VM_NOSLEEP, IPL_NONE); 261 if (swapmap == 0) { 262 panic("%s: vmem_create failed", __func__); 263 } 264 265 pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 266 NULL, IPL_BIO); 267 pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 268 NULL, IPL_BIO); 269 270 UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0); 271 } 272 273 /* 274 * swaplist functions: functions that operate on the list of swap 275 * devices on the system. 276 */ 277 278 /* 279 * swaplist_insert: insert swap device "sdp" into the global list 280 * 281 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 282 * => caller must provide a newly allocated swappri structure (we will 283 * FREE it if we don't need it... this it to prevent allocation 284 * blocking here while adding swap) 285 */ 286 static void 287 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority) 288 { 289 struct swappri *spp, *pspp; 290 UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist); 291 292 /* 293 * find entry at or after which to insert the new device. 294 */ 295 pspp = NULL; 296 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 297 if (priority <= spp->spi_priority) 298 break; 299 pspp = spp; 300 } 301 302 /* 303 * new priority? 304 */ 305 if (spp == NULL || spp->spi_priority != priority) { 306 spp = newspp; /* use newspp! */ 307 UVMHIST_LOG(pdhist, "created new swappri = %jd", 308 priority, 0, 0, 0); 309 310 spp->spi_priority = priority; 311 TAILQ_INIT(&spp->spi_swapdev); 312 313 if (pspp) 314 LIST_INSERT_AFTER(pspp, spp, spi_swappri); 315 else 316 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri); 317 } else { 318 /* we don't need a new priority structure, free it */ 319 kmem_free(newspp, sizeof(*newspp)); 320 } 321 322 /* 323 * priority found (or created). now insert on the priority's 324 * tailq list and bump the total number of swapdevs. 325 */ 326 sdp->swd_priority = priority; 327 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 328 uvmexp.nswapdev++; 329 } 330 331 /* 332 * swaplist_find: find and optionally remove a swap device from the 333 * global list. 334 * 335 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 336 * => we return the swapdev we found (and removed) 337 */ 338 static struct swapdev * 339 swaplist_find(struct vnode *vp, bool remove) 340 { 341 struct swapdev *sdp; 342 struct swappri *spp; 343 344 /* 345 * search the lists for the requested vp 346 */ 347 348 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 349 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 350 if (sdp->swd_vp == vp) { 351 if (remove) { 352 TAILQ_REMOVE(&spp->spi_swapdev, 353 sdp, swd_next); 354 uvmexp.nswapdev--; 355 } 356 return(sdp); 357 } 358 } 359 } 360 return (NULL); 361 } 362 363 /* 364 * swaplist_trim: scan priority list for empty priority entries and kill 365 * them. 366 * 367 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 368 */ 369 static void 370 swaplist_trim(void) 371 { 372 struct swappri *spp, *nextspp; 373 374 LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) { 375 if (!TAILQ_EMPTY(&spp->spi_swapdev)) 376 continue; 377 LIST_REMOVE(spp, spi_swappri); 378 kmem_free(spp, sizeof(*spp)); 379 } 380 } 381 382 /* 383 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back 384 * to the "swapdev" that maps that section of the drum. 385 * 386 * => each swapdev takes one big contig chunk of the drum 387 * => caller must hold uvm_swap_data_lock 388 */ 389 static struct swapdev * 390 swapdrum_getsdp(int pgno) 391 { 392 struct swapdev *sdp; 393 struct swappri *spp; 394 395 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 396 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 397 if (sdp->swd_flags & SWF_FAKE) 398 continue; 399 if (pgno >= sdp->swd_drumoffset && 400 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { 401 return sdp; 402 } 403 } 404 } 405 return NULL; 406 } 407 408 void swapsys_lock(krw_t op) 409 { 410 rw_enter(&swap_syscall_lock, op); 411 } 412 413 void swapsys_unlock(void) 414 { 415 rw_exit(&swap_syscall_lock); 416 } 417 418 static void 419 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse) 420 { 421 se->se_dev = sdp->swd_dev; 422 se->se_flags = sdp->swd_flags; 423 se->se_nblks = sdp->swd_nblks; 424 se->se_inuse = inuse; 425 se->se_priority = sdp->swd_priority; 426 KASSERT(sdp->swd_pathlen < sizeof(se->se_path)); 427 strcpy(se->se_path, sdp->swd_path); 428 } 429 430 int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) = 431 (void *)enosys; 432 int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) = 433 (void *)enosys; 434 435 /* 436 * sys_swapctl: main entry point for swapctl(2) system call 437 * [with two helper functions: swap_on and swap_off] 438 */ 439 int 440 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval) 441 { 442 /* { 443 syscallarg(int) cmd; 444 syscallarg(void *) arg; 445 syscallarg(int) misc; 446 } */ 447 struct vnode *vp; 448 struct nameidata nd; 449 struct swappri *spp; 450 struct swapdev *sdp; 451 #define SWAP_PATH_MAX (PATH_MAX + 1) 452 char *userpath; 453 size_t len = 0; 454 int error; 455 int priority; 456 UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist); 457 458 /* 459 * we handle the non-priv NSWAP and STATS request first. 460 * 461 * SWAP_NSWAP: return number of config'd swap devices 462 * [can also be obtained with uvmexp sysctl] 463 */ 464 if (SCARG(uap, cmd) == SWAP_NSWAP) { 465 const int nswapdev = uvmexp.nswapdev; 466 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev, 467 0, 0, 0); 468 *retval = nswapdev; 469 return 0; 470 } 471 472 userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP); 473 474 /* 475 * ensure serialized syscall access by grabbing the swap_syscall_lock 476 */ 477 rw_enter(&swap_syscall_lock, RW_WRITER); 478 479 /* 480 * SWAP_STATS: get stats on current # of configured swap devs 481 * 482 * note that the swap_priority list can't change as long 483 * as we are holding the swap_syscall_lock. we don't want 484 * to grab the uvm_swap_data_lock because we may fault&sleep during 485 * copyout() and we don't want to be holding that lock then! 486 */ 487 switch (SCARG(uap, cmd)) { 488 case SWAP_STATS13: 489 error = (*uvm_swap_stats13)(uap, retval); 490 goto out; 491 case SWAP_STATS50: 492 error = (*uvm_swap_stats50)(uap, retval); 493 goto out; 494 case SWAP_STATS: 495 error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc), 496 NULL, sizeof(struct swapent), retval); 497 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0); 498 goto out; 499 500 case SWAP_GETDUMPDEV: 501 error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev)); 502 goto out; 503 default: 504 break; 505 } 506 507 /* 508 * all other requests require superuser privs. verify. 509 */ 510 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL, 511 0, NULL, NULL, NULL))) 512 goto out; 513 514 if (SCARG(uap, cmd) == SWAP_DUMPOFF) { 515 /* drop the current dump device */ 516 dumpdev = NODEV; 517 dumpcdev = NODEV; 518 cpu_dumpconf(); 519 goto out; 520 } 521 522 /* 523 * at this point we expect a path name in arg. we will 524 * use namei() to gain a vnode reference (vref), and lock 525 * the vnode (VOP_LOCK). 526 * 527 * XXX: a NULL arg means use the root vnode pointer (e.g. for 528 * miniroot) 529 */ 530 if (SCARG(uap, arg) == NULL) { 531 vp = rootvp; /* miniroot */ 532 vref(vp); 533 if (vn_lock(vp, LK_EXCLUSIVE)) { 534 vrele(vp); 535 error = EBUSY; 536 goto out; 537 } 538 if (SCARG(uap, cmd) == SWAP_ON && 539 copystr("miniroot", userpath, SWAP_PATH_MAX, &len)) 540 panic("swapctl: miniroot copy failed"); 541 } else { 542 struct pathbuf *pb; 543 544 /* 545 * This used to allow copying in one extra byte 546 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON. 547 * This was completely pointless because if anyone 548 * used that extra byte namei would fail with 549 * ENAMETOOLONG anyway, so I've removed the excess 550 * logic. - dholland 20100215 551 */ 552 553 error = pathbuf_copyin(SCARG(uap, arg), &pb); 554 if (error) { 555 goto out; 556 } 557 if (SCARG(uap, cmd) == SWAP_ON) { 558 /* get a copy of the string */ 559 pathbuf_copystring(pb, userpath, SWAP_PATH_MAX); 560 len = strlen(userpath) + 1; 561 } 562 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); 563 if ((error = namei(&nd))) { 564 pathbuf_destroy(pb); 565 goto out; 566 } 567 vp = nd.ni_vp; 568 pathbuf_destroy(pb); 569 } 570 /* note: "vp" is referenced and locked */ 571 572 error = 0; /* assume no error */ 573 switch(SCARG(uap, cmd)) { 574 575 case SWAP_DUMPDEV: 576 if (vp->v_type != VBLK) { 577 error = ENOTBLK; 578 break; 579 } 580 if (bdevsw_lookup(vp->v_rdev)) { 581 dumpdev = vp->v_rdev; 582 dumpcdev = devsw_blk2chr(dumpdev); 583 } else 584 dumpdev = NODEV; 585 cpu_dumpconf(); 586 break; 587 588 case SWAP_CTL: 589 /* 590 * get new priority, remove old entry (if any) and then 591 * reinsert it in the correct place. finally, prune out 592 * any empty priority structures. 593 */ 594 priority = SCARG(uap, misc); 595 spp = kmem_alloc(sizeof(*spp), KM_SLEEP); 596 mutex_enter(&uvm_swap_data_lock); 597 if ((sdp = swaplist_find(vp, true)) == NULL) { 598 error = ENOENT; 599 } else { 600 swaplist_insert(sdp, spp, priority); 601 swaplist_trim(); 602 } 603 mutex_exit(&uvm_swap_data_lock); 604 if (error) 605 kmem_free(spp, sizeof(*spp)); 606 break; 607 608 case SWAP_ON: 609 610 /* 611 * check for duplicates. if none found, then insert a 612 * dummy entry on the list to prevent someone else from 613 * trying to enable this device while we are working on 614 * it. 615 */ 616 617 priority = SCARG(uap, misc); 618 sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP); 619 spp = kmem_alloc(sizeof(*spp), KM_SLEEP); 620 sdp->swd_flags = SWF_FAKE; 621 sdp->swd_vp = vp; 622 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV; 623 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK); 624 mutex_enter(&uvm_swap_data_lock); 625 if (swaplist_find(vp, false) != NULL) { 626 error = EBUSY; 627 mutex_exit(&uvm_swap_data_lock); 628 bufq_free(sdp->swd_tab); 629 kmem_free(sdp, sizeof(*sdp)); 630 kmem_free(spp, sizeof(*spp)); 631 break; 632 } 633 swaplist_insert(sdp, spp, priority); 634 mutex_exit(&uvm_swap_data_lock); 635 636 KASSERT(len > 0); 637 sdp->swd_pathlen = len; 638 sdp->swd_path = kmem_alloc(len, KM_SLEEP); 639 if (copystr(userpath, sdp->swd_path, len, 0) != 0) 640 panic("swapctl: copystr"); 641 642 /* 643 * we've now got a FAKE placeholder in the swap list. 644 * now attempt to enable swap on it. if we fail, undo 645 * what we've done and kill the fake entry we just inserted. 646 * if swap_on is a success, it will clear the SWF_FAKE flag 647 */ 648 649 if ((error = swap_on(l, sdp)) != 0) { 650 mutex_enter(&uvm_swap_data_lock); 651 (void) swaplist_find(vp, true); /* kill fake entry */ 652 swaplist_trim(); 653 mutex_exit(&uvm_swap_data_lock); 654 bufq_free(sdp->swd_tab); 655 kmem_free(sdp->swd_path, sdp->swd_pathlen); 656 kmem_free(sdp, sizeof(*sdp)); 657 break; 658 } 659 break; 660 661 case SWAP_OFF: 662 mutex_enter(&uvm_swap_data_lock); 663 if ((sdp = swaplist_find(vp, false)) == NULL) { 664 mutex_exit(&uvm_swap_data_lock); 665 error = ENXIO; 666 break; 667 } 668 669 /* 670 * If a device isn't in use or enabled, we 671 * can't stop swapping from it (again). 672 */ 673 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { 674 mutex_exit(&uvm_swap_data_lock); 675 error = EBUSY; 676 break; 677 } 678 679 /* 680 * do the real work. 681 */ 682 error = swap_off(l, sdp); 683 break; 684 685 default: 686 error = EINVAL; 687 } 688 689 /* 690 * done! release the ref gained by namei() and unlock. 691 */ 692 vput(vp); 693 out: 694 rw_exit(&swap_syscall_lock); 695 kmem_free(userpath, SWAP_PATH_MAX); 696 697 UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0); 698 return (error); 699 } 700 701 /* 702 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept 703 * away from sys_swapctl() in order to allow COMPAT_* swapctl() 704 * emulation to use it directly without going through sys_swapctl(). 705 * The problem with using sys_swapctl() there is that it involves 706 * copying the swapent array to the stackgap, and this array's size 707 * is not known at build time. Hence it would not be possible to 708 * ensure it would fit in the stackgap in any case. 709 */ 710 int 711 uvm_swap_stats(char *ptr, int misc, 712 void (*f)(void *, const struct swapent *), size_t len, 713 register_t *retval) 714 { 715 struct swappri *spp; 716 struct swapdev *sdp; 717 struct swapent sep; 718 int count = 0; 719 int error; 720 721 KASSERT(len <= sizeof(sep)); 722 if (len == 0) 723 return ENOSYS; 724 725 if (misc < 0) 726 return EINVAL; 727 728 if (misc == 0 || uvmexp.nswapdev == 0) 729 return 0; 730 731 /* Make sure userland cannot exhaust kernel memory */ 732 if ((size_t)misc > (size_t)uvmexp.nswapdev) 733 misc = uvmexp.nswapdev; 734 735 KASSERT(rw_lock_held(&swap_syscall_lock)); 736 737 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 738 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 739 int inuse; 740 741 if (misc-- <= 0) 742 break; 743 744 inuse = btodb((uint64_t)sdp->swd_npginuse << 745 PAGE_SHIFT); 746 747 swapent_cvt(&sep, sdp, inuse); 748 if (f) 749 (*f)(&sep, &sep); 750 if ((error = copyout(&sep, ptr, len)) != 0) 751 return error; 752 ptr += len; 753 count++; 754 } 755 } 756 *retval = count; 757 return 0; 758 } 759 760 /* 761 * swap_on: attempt to enable a swapdev for swapping. note that the 762 * swapdev is already on the global list, but disabled (marked 763 * SWF_FAKE). 764 * 765 * => we avoid the start of the disk (to protect disk labels) 766 * => we also avoid the miniroot, if we are swapping to root. 767 * => caller should leave uvm_swap_data_lock unlocked, we may lock it 768 * if needed. 769 */ 770 static int 771 swap_on(struct lwp *l, struct swapdev *sdp) 772 { 773 struct vnode *vp; 774 int error, npages, nblocks, size; 775 long addr; 776 vmem_addr_t result; 777 struct vattr va; 778 dev_t dev; 779 UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist); 780 781 /* 782 * we want to enable swapping on sdp. the swd_vp contains 783 * the vnode we want (locked and ref'd), and the swd_dev 784 * contains the dev_t of the file, if it a block device. 785 */ 786 787 vp = sdp->swd_vp; 788 dev = sdp->swd_dev; 789 790 /* 791 * open the swap file (mostly useful for block device files to 792 * let device driver know what is up). 793 * 794 * we skip the open/close for root on swap because the root 795 * has already been opened when root was mounted (mountroot). 796 */ 797 if (vp != rootvp) { 798 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred))) 799 return (error); 800 } 801 802 /* XXX this only works for block devices */ 803 UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0); 804 805 /* 806 * we now need to determine the size of the swap area. for 807 * block specials we can call the d_psize function. 808 * for normal files, we must stat [get attrs]. 809 * 810 * we put the result in nblks. 811 * for normal files, we also want the filesystem block size 812 * (which we get with statfs). 813 */ 814 switch (vp->v_type) { 815 case VBLK: 816 if ((nblocks = bdev_size(dev)) == -1) { 817 error = ENXIO; 818 goto bad; 819 } 820 break; 821 822 case VREG: 823 if ((error = VOP_GETATTR(vp, &va, l->l_cred))) 824 goto bad; 825 nblocks = (int)btodb(va.va_size); 826 sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift; 827 /* 828 * limit the max # of outstanding I/O requests we issue 829 * at any one time. take it easy on NFS servers. 830 */ 831 if (vp->v_tag == VT_NFS) 832 sdp->swd_maxactive = 2; /* XXX */ 833 else 834 sdp->swd_maxactive = 8; /* XXX */ 835 break; 836 837 default: 838 error = ENXIO; 839 goto bad; 840 } 841 842 /* 843 * save nblocks in a safe place and convert to pages. 844 */ 845 846 sdp->swd_nblks = nblocks; 847 npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT; 848 849 /* 850 * for block special files, we want to make sure that leave 851 * the disklabel and bootblocks alone, so we arrange to skip 852 * over them (arbitrarily choosing to skip PAGE_SIZE bytes). 853 * note that because of this the "size" can be less than the 854 * actual number of blocks on the device. 855 */ 856 if (vp->v_type == VBLK) { 857 /* we use pages 1 to (size - 1) [inclusive] */ 858 size = npages - 1; 859 addr = 1; 860 } else { 861 /* we use pages 0 to (size - 1) [inclusive] */ 862 size = npages; 863 addr = 0; 864 } 865 866 /* 867 * make sure we have enough blocks for a reasonable sized swap 868 * area. we want at least one page. 869 */ 870 871 if (size < 1) { 872 UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0); 873 error = EINVAL; 874 goto bad; 875 } 876 877 UVMHIST_LOG(pdhist, " dev=%jx: size=%jd addr=%jd", dev, size, addr, 0); 878 879 /* 880 * now we need to allocate an extent to manage this swap device 881 */ 882 883 sdp->swd_blist = blist_create(npages); 884 /* mark all expect the `saved' region free. */ 885 blist_free(sdp->swd_blist, addr, size); 886 887 /* 888 * if the vnode we are swapping to is the root vnode 889 * (i.e. we are swapping to the miniroot) then we want 890 * to make sure we don't overwrite it. do a statfs to 891 * find its size and skip over it. 892 */ 893 if (vp == rootvp) { 894 struct mount *mp; 895 struct statvfs *sp; 896 int rootblocks, rootpages; 897 898 mp = rootvnode->v_mount; 899 sp = &mp->mnt_stat; 900 rootblocks = sp->f_blocks * btodb(sp->f_frsize); 901 /* 902 * XXX: sp->f_blocks isn't the total number of 903 * blocks in the filesystem, it's the number of 904 * data blocks. so, our rootblocks almost 905 * definitely underestimates the total size 906 * of the filesystem - how badly depends on the 907 * details of the filesystem type. there isn't 908 * an obvious way to deal with this cleanly 909 * and perfectly, so for now we just pad our 910 * rootblocks estimate with an extra 5 percent. 911 */ 912 rootblocks += (rootblocks >> 5) + 913 (rootblocks >> 6) + 914 (rootblocks >> 7); 915 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT; 916 if (rootpages > size) 917 panic("swap_on: miniroot larger than swap?"); 918 919 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) { 920 panic("swap_on: unable to preserve miniroot"); 921 } 922 923 size -= rootpages; 924 printf("Preserved %d pages of miniroot ", rootpages); 925 printf("leaving %d pages of swap\n", size); 926 } 927 928 /* 929 * add a ref to vp to reflect usage as a swap device. 930 */ 931 vref(vp); 932 933 /* 934 * now add the new swapdev to the drum and enable. 935 */ 936 error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result); 937 if (error != 0) 938 panic("swapdrum_add"); 939 /* 940 * If this is the first regular swap create the workqueue. 941 * => Protected by swap_syscall_lock. 942 */ 943 if (vp->v_type != VBLK) { 944 if (sw_reg_count++ == 0) { 945 KASSERT(sw_reg_workqueue == NULL); 946 if (workqueue_create(&sw_reg_workqueue, "swapiod", 947 sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0) 948 panic("%s: workqueue_create failed", __func__); 949 } 950 } 951 952 sdp->swd_drumoffset = (int)result; 953 sdp->swd_drumsize = npages; 954 sdp->swd_npages = size; 955 mutex_enter(&uvm_swap_data_lock); 956 sdp->swd_flags &= ~SWF_FAKE; /* going live */ 957 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); 958 uvmexp.swpages += size; 959 uvmexp.swpgavail += size; 960 mutex_exit(&uvm_swap_data_lock); 961 return (0); 962 963 /* 964 * failure: clean up and return error. 965 */ 966 967 bad: 968 if (sdp->swd_blist) { 969 blist_destroy(sdp->swd_blist); 970 } 971 if (vp != rootvp) { 972 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred); 973 } 974 return (error); 975 } 976 977 /* 978 * swap_off: stop swapping on swapdev 979 * 980 * => swap data should be locked, we will unlock. 981 */ 982 static int 983 swap_off(struct lwp *l, struct swapdev *sdp) 984 { 985 int npages = sdp->swd_npages; 986 int error = 0; 987 988 UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist); 989 UVMHIST_LOG(pdhist, " dev=%jx, npages=%jd", sdp->swd_dev,npages, 0, 0); 990 991 /* disable the swap area being removed */ 992 sdp->swd_flags &= ~SWF_ENABLE; 993 uvmexp.swpgavail -= npages; 994 mutex_exit(&uvm_swap_data_lock); 995 996 /* 997 * the idea is to find all the pages that are paged out to this 998 * device, and page them all in. in uvm, swap-backed pageable 999 * memory can take two forms: aobjs and anons. call the 1000 * swapoff hook for each subsystem to bring in pages. 1001 */ 1002 1003 if (uao_swap_off(sdp->swd_drumoffset, 1004 sdp->swd_drumoffset + sdp->swd_drumsize) || 1005 amap_swap_off(sdp->swd_drumoffset, 1006 sdp->swd_drumoffset + sdp->swd_drumsize)) { 1007 error = ENOMEM; 1008 } else if (sdp->swd_npginuse > sdp->swd_npgbad) { 1009 error = EBUSY; 1010 } 1011 1012 if (error) { 1013 mutex_enter(&uvm_swap_data_lock); 1014 sdp->swd_flags |= SWF_ENABLE; 1015 uvmexp.swpgavail += npages; 1016 mutex_exit(&uvm_swap_data_lock); 1017 1018 return error; 1019 } 1020 1021 /* 1022 * If this is the last regular swap destroy the workqueue. 1023 * => Protected by swap_syscall_lock. 1024 */ 1025 if (sdp->swd_vp->v_type != VBLK) { 1026 KASSERT(sw_reg_count > 0); 1027 KASSERT(sw_reg_workqueue != NULL); 1028 if (--sw_reg_count == 0) { 1029 workqueue_destroy(sw_reg_workqueue); 1030 sw_reg_workqueue = NULL; 1031 } 1032 } 1033 1034 /* 1035 * done with the vnode. 1036 * drop our ref on the vnode before calling VOP_CLOSE() 1037 * so that spec_close() can tell if this is the last close. 1038 */ 1039 vrele(sdp->swd_vp); 1040 if (sdp->swd_vp != rootvp) { 1041 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred); 1042 } 1043 1044 mutex_enter(&uvm_swap_data_lock); 1045 uvmexp.swpages -= npages; 1046 uvmexp.swpginuse -= sdp->swd_npgbad; 1047 1048 if (swaplist_find(sdp->swd_vp, true) == NULL) 1049 panic("%s: swapdev not in list", __func__); 1050 swaplist_trim(); 1051 mutex_exit(&uvm_swap_data_lock); 1052 1053 /* 1054 * free all resources! 1055 */ 1056 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize); 1057 blist_destroy(sdp->swd_blist); 1058 bufq_free(sdp->swd_tab); 1059 kmem_free(sdp, sizeof(*sdp)); 1060 return (0); 1061 } 1062 1063 void 1064 uvm_swap_shutdown(struct lwp *l) 1065 { 1066 struct swapdev *sdp; 1067 struct swappri *spp; 1068 struct vnode *vp; 1069 int error; 1070 1071 printf("turning of swap..."); 1072 rw_enter(&swap_syscall_lock, RW_WRITER); 1073 mutex_enter(&uvm_swap_data_lock); 1074 again: 1075 LIST_FOREACH(spp, &swap_priority, spi_swappri) 1076 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1077 if (sdp->swd_flags & SWF_FAKE) 1078 continue; 1079 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) 1080 continue; 1081 #ifdef DEBUG 1082 printf("\nturning off swap on %s...", 1083 sdp->swd_path); 1084 #endif 1085 if (vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE)) { 1086 error = EBUSY; 1087 vp = NULL; 1088 } else 1089 error = 0; 1090 if (!error) { 1091 error = swap_off(l, sdp); 1092 mutex_enter(&uvm_swap_data_lock); 1093 } 1094 if (error) { 1095 printf("stopping swap on %s failed " 1096 "with error %d\n", sdp->swd_path, error); 1097 TAILQ_REMOVE(&spp->spi_swapdev, sdp, 1098 swd_next); 1099 uvmexp.nswapdev--; 1100 swaplist_trim(); 1101 if (vp) 1102 vput(vp); 1103 } 1104 goto again; 1105 } 1106 printf(" done\n"); 1107 mutex_exit(&uvm_swap_data_lock); 1108 rw_exit(&swap_syscall_lock); 1109 } 1110 1111 1112 /* 1113 * /dev/drum interface and i/o functions 1114 */ 1115 1116 /* 1117 * swstrategy: perform I/O on the drum 1118 * 1119 * => we must map the i/o request from the drum to the correct swapdev. 1120 */ 1121 static void 1122 swstrategy(struct buf *bp) 1123 { 1124 struct swapdev *sdp; 1125 struct vnode *vp; 1126 int pageno, bn; 1127 UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist); 1128 1129 /* 1130 * convert block number to swapdev. note that swapdev can't 1131 * be yanked out from under us because we are holding resources 1132 * in it (i.e. the blocks we are doing I/O on). 1133 */ 1134 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; 1135 mutex_enter(&uvm_swap_data_lock); 1136 sdp = swapdrum_getsdp(pageno); 1137 mutex_exit(&uvm_swap_data_lock); 1138 if (sdp == NULL) { 1139 bp->b_error = EINVAL; 1140 bp->b_resid = bp->b_bcount; 1141 biodone(bp); 1142 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0); 1143 return; 1144 } 1145 1146 /* 1147 * convert drum page number to block number on this swapdev. 1148 */ 1149 1150 pageno -= sdp->swd_drumoffset; /* page # on swapdev */ 1151 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ 1152 1153 UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%jx bn=%jx bcount=%jd", 1154 ((bp->b_flags & B_READ) == 0) ? 1 : 0, 1155 sdp->swd_drumoffset, bn, bp->b_bcount); 1156 1157 /* 1158 * for block devices we finish up here. 1159 * for regular files we have to do more work which we delegate 1160 * to sw_reg_strategy(). 1161 */ 1162 1163 vp = sdp->swd_vp; /* swapdev vnode pointer */ 1164 switch (vp->v_type) { 1165 default: 1166 panic("%s: vnode type 0x%x", __func__, vp->v_type); 1167 1168 case VBLK: 1169 1170 /* 1171 * must convert "bp" from an I/O on /dev/drum to an I/O 1172 * on the swapdev (sdp). 1173 */ 1174 bp->b_blkno = bn; /* swapdev block number */ 1175 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */ 1176 1177 /* 1178 * if we are doing a write, we have to redirect the i/o on 1179 * drum's v_numoutput counter to the swapdevs. 1180 */ 1181 if ((bp->b_flags & B_READ) == 0) { 1182 mutex_enter(bp->b_objlock); 1183 vwakeup(bp); /* kills one 'v_numoutput' on drum */ 1184 mutex_exit(bp->b_objlock); 1185 mutex_enter(vp->v_interlock); 1186 vp->v_numoutput++; /* put it on swapdev */ 1187 mutex_exit(vp->v_interlock); 1188 } 1189 1190 /* 1191 * finally plug in swapdev vnode and start I/O 1192 */ 1193 bp->b_vp = vp; 1194 bp->b_objlock = vp->v_interlock; 1195 VOP_STRATEGY(vp, bp); 1196 return; 1197 1198 case VREG: 1199 /* 1200 * delegate to sw_reg_strategy function. 1201 */ 1202 sw_reg_strategy(sdp, bp, bn); 1203 return; 1204 } 1205 /* NOTREACHED */ 1206 } 1207 1208 /* 1209 * swread: the read function for the drum (just a call to physio) 1210 */ 1211 /*ARGSUSED*/ 1212 static int 1213 swread(dev_t dev, struct uio *uio, int ioflag) 1214 { 1215 UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist); 1216 1217 UVMHIST_LOG(pdhist, " dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0); 1218 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio)); 1219 } 1220 1221 /* 1222 * swwrite: the write function for the drum (just a call to physio) 1223 */ 1224 /*ARGSUSED*/ 1225 static int 1226 swwrite(dev_t dev, struct uio *uio, int ioflag) 1227 { 1228 UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist); 1229 1230 UVMHIST_LOG(pdhist, " dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0); 1231 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio)); 1232 } 1233 1234 const struct bdevsw swap_bdevsw = { 1235 .d_open = nullopen, 1236 .d_close = nullclose, 1237 .d_strategy = swstrategy, 1238 .d_ioctl = noioctl, 1239 .d_dump = nodump, 1240 .d_psize = nosize, 1241 .d_discard = nodiscard, 1242 .d_flag = D_OTHER 1243 }; 1244 1245 const struct cdevsw swap_cdevsw = { 1246 .d_open = nullopen, 1247 .d_close = nullclose, 1248 .d_read = swread, 1249 .d_write = swwrite, 1250 .d_ioctl = noioctl, 1251 .d_stop = nostop, 1252 .d_tty = notty, 1253 .d_poll = nopoll, 1254 .d_mmap = nommap, 1255 .d_kqfilter = nokqfilter, 1256 .d_discard = nodiscard, 1257 .d_flag = D_OTHER, 1258 }; 1259 1260 /* 1261 * sw_reg_strategy: handle swap i/o to regular files 1262 */ 1263 static void 1264 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) 1265 { 1266 struct vnode *vp; 1267 struct vndxfer *vnx; 1268 daddr_t nbn; 1269 char *addr; 1270 off_t byteoff; 1271 int s, off, nra, error, sz, resid; 1272 UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist); 1273 1274 /* 1275 * allocate a vndxfer head for this transfer and point it to 1276 * our buffer. 1277 */ 1278 vnx = pool_get(&vndxfer_pool, PR_WAITOK); 1279 vnx->vx_flags = VX_BUSY; 1280 vnx->vx_error = 0; 1281 vnx->vx_pending = 0; 1282 vnx->vx_bp = bp; 1283 vnx->vx_sdp = sdp; 1284 1285 /* 1286 * setup for main loop where we read filesystem blocks into 1287 * our buffer. 1288 */ 1289 error = 0; 1290 bp->b_resid = bp->b_bcount; /* nothing transfered yet! */ 1291 addr = bp->b_data; /* current position in buffer */ 1292 byteoff = dbtob((uint64_t)bn); 1293 1294 for (resid = bp->b_resid; resid; resid -= sz) { 1295 struct vndbuf *nbp; 1296 1297 /* 1298 * translate byteoffset into block number. return values: 1299 * vp = vnode of underlying device 1300 * nbn = new block number (on underlying vnode dev) 1301 * nra = num blocks we can read-ahead (excludes requested 1302 * block) 1303 */ 1304 nra = 0; 1305 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, 1306 &vp, &nbn, &nra); 1307 1308 if (error == 0 && nbn == (daddr_t)-1) { 1309 /* 1310 * this used to just set error, but that doesn't 1311 * do the right thing. Instead, it causes random 1312 * memory errors. The panic() should remain until 1313 * this condition doesn't destabilize the system. 1314 */ 1315 #if 1 1316 panic("%s: swap to sparse file", __func__); 1317 #else 1318 error = EIO; /* failure */ 1319 #endif 1320 } 1321 1322 /* 1323 * punt if there was an error or a hole in the file. 1324 * we must wait for any i/o ops we have already started 1325 * to finish before returning. 1326 * 1327 * XXX we could deal with holes here but it would be 1328 * a hassle (in the write case). 1329 */ 1330 if (error) { 1331 s = splbio(); 1332 vnx->vx_error = error; /* pass error up */ 1333 goto out; 1334 } 1335 1336 /* 1337 * compute the size ("sz") of this transfer (in bytes). 1338 */ 1339 off = byteoff % sdp->swd_bsize; 1340 sz = (1 + nra) * sdp->swd_bsize - off; 1341 if (sz > resid) 1342 sz = resid; 1343 1344 UVMHIST_LOG(pdhist, "sw_reg_strategy: " 1345 "vp %#jx/%#jx offset 0x%jx/0x%jx", 1346 (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn); 1347 1348 /* 1349 * now get a buf structure. note that the vb_buf is 1350 * at the front of the nbp structure so that you can 1351 * cast pointers between the two structure easily. 1352 */ 1353 nbp = pool_get(&vndbuf_pool, PR_WAITOK); 1354 buf_init(&nbp->vb_buf); 1355 nbp->vb_buf.b_flags = bp->b_flags; 1356 nbp->vb_buf.b_cflags = bp->b_cflags; 1357 nbp->vb_buf.b_oflags = bp->b_oflags; 1358 nbp->vb_buf.b_bcount = sz; 1359 nbp->vb_buf.b_bufsize = sz; 1360 nbp->vb_buf.b_error = 0; 1361 nbp->vb_buf.b_data = addr; 1362 nbp->vb_buf.b_lblkno = 0; 1363 nbp->vb_buf.b_blkno = nbn + btodb(off); 1364 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno; 1365 nbp->vb_buf.b_iodone = sw_reg_biodone; 1366 nbp->vb_buf.b_vp = vp; 1367 nbp->vb_buf.b_objlock = vp->v_interlock; 1368 if (vp->v_type == VBLK) { 1369 nbp->vb_buf.b_dev = vp->v_rdev; 1370 } 1371 1372 nbp->vb_xfer = vnx; /* patch it back in to vnx */ 1373 1374 /* 1375 * Just sort by block number 1376 */ 1377 s = splbio(); 1378 if (vnx->vx_error != 0) { 1379 buf_destroy(&nbp->vb_buf); 1380 pool_put(&vndbuf_pool, nbp); 1381 goto out; 1382 } 1383 vnx->vx_pending++; 1384 1385 /* sort it in and start I/O if we are not over our limit */ 1386 /* XXXAD locking */ 1387 bufq_put(sdp->swd_tab, &nbp->vb_buf); 1388 sw_reg_start(sdp); 1389 splx(s); 1390 1391 /* 1392 * advance to the next I/O 1393 */ 1394 byteoff += sz; 1395 addr += sz; 1396 } 1397 1398 s = splbio(); 1399 1400 out: /* Arrive here at splbio */ 1401 vnx->vx_flags &= ~VX_BUSY; 1402 if (vnx->vx_pending == 0) { 1403 error = vnx->vx_error; 1404 pool_put(&vndxfer_pool, vnx); 1405 bp->b_error = error; 1406 biodone(bp); 1407 } 1408 splx(s); 1409 } 1410 1411 /* 1412 * sw_reg_start: start an I/O request on the requested swapdev 1413 * 1414 * => reqs are sorted by b_rawblkno (above) 1415 */ 1416 static void 1417 sw_reg_start(struct swapdev *sdp) 1418 { 1419 struct buf *bp; 1420 struct vnode *vp; 1421 UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist); 1422 1423 /* recursion control */ 1424 if ((sdp->swd_flags & SWF_BUSY) != 0) 1425 return; 1426 1427 sdp->swd_flags |= SWF_BUSY; 1428 1429 while (sdp->swd_active < sdp->swd_maxactive) { 1430 bp = bufq_get(sdp->swd_tab); 1431 if (bp == NULL) 1432 break; 1433 sdp->swd_active++; 1434 1435 UVMHIST_LOG(pdhist, 1436 "sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %jx", 1437 (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno, 1438 bp->b_bcount); 1439 vp = bp->b_vp; 1440 KASSERT(bp->b_objlock == vp->v_interlock); 1441 if ((bp->b_flags & B_READ) == 0) { 1442 mutex_enter(vp->v_interlock); 1443 vp->v_numoutput++; 1444 mutex_exit(vp->v_interlock); 1445 } 1446 VOP_STRATEGY(vp, bp); 1447 } 1448 sdp->swd_flags &= ~SWF_BUSY; 1449 } 1450 1451 /* 1452 * sw_reg_biodone: one of our i/o's has completed 1453 */ 1454 static void 1455 sw_reg_biodone(struct buf *bp) 1456 { 1457 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL); 1458 } 1459 1460 /* 1461 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup 1462 * 1463 * => note that we can recover the vndbuf struct by casting the buf ptr 1464 */ 1465 static void 1466 sw_reg_iodone(struct work *wk, void *dummy) 1467 { 1468 struct vndbuf *vbp = (void *)wk; 1469 struct vndxfer *vnx = vbp->vb_xfer; 1470 struct buf *pbp = vnx->vx_bp; /* parent buffer */ 1471 struct swapdev *sdp = vnx->vx_sdp; 1472 int s, resid, error; 1473 KASSERT(&vbp->vb_buf.b_work == wk); 1474 UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist); 1475 1476 UVMHIST_LOG(pdhist, " vbp=%#jx vp=%#jx blkno=%jx addr=%#jx", 1477 (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, 1478 (uintptr_t)vbp->vb_buf.b_data); 1479 UVMHIST_LOG(pdhist, " cnt=%jx resid=%jx", 1480 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0); 1481 1482 /* 1483 * protect vbp at splbio and update. 1484 */ 1485 1486 s = splbio(); 1487 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; 1488 pbp->b_resid -= resid; 1489 vnx->vx_pending--; 1490 1491 if (vbp->vb_buf.b_error != 0) { 1492 /* pass error upward */ 1493 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO; 1494 UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0); 1495 vnx->vx_error = error; 1496 } 1497 1498 /* 1499 * kill vbp structure 1500 */ 1501 buf_destroy(&vbp->vb_buf); 1502 pool_put(&vndbuf_pool, vbp); 1503 1504 /* 1505 * wrap up this transaction if it has run to completion or, in 1506 * case of an error, when all auxiliary buffers have returned. 1507 */ 1508 if (vnx->vx_error != 0) { 1509 /* pass error upward */ 1510 error = vnx->vx_error; 1511 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { 1512 pbp->b_error = error; 1513 biodone(pbp); 1514 pool_put(&vndxfer_pool, vnx); 1515 } 1516 } else if (pbp->b_resid == 0) { 1517 KASSERT(vnx->vx_pending == 0); 1518 if ((vnx->vx_flags & VX_BUSY) == 0) { 1519 UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !", 1520 (uintptr_t)pbp, vnx->vx_error, 0, 0); 1521 biodone(pbp); 1522 pool_put(&vndxfer_pool, vnx); 1523 } 1524 } 1525 1526 /* 1527 * done! start next swapdev I/O if one is pending 1528 */ 1529 sdp->swd_active--; 1530 sw_reg_start(sdp); 1531 splx(s); 1532 } 1533 1534 1535 /* 1536 * uvm_swap_alloc: allocate space on swap 1537 * 1538 * => allocation is done "round robin" down the priority list, as we 1539 * allocate in a priority we "rotate" the circle queue. 1540 * => space can be freed with uvm_swap_free 1541 * => we return the page slot number in /dev/drum (0 == invalid slot) 1542 * => we lock uvm_swap_data_lock 1543 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM 1544 */ 1545 int 1546 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok) 1547 { 1548 struct swapdev *sdp; 1549 struct swappri *spp; 1550 UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist); 1551 1552 /* 1553 * no swap devices configured yet? definite failure. 1554 */ 1555 if (uvmexp.nswapdev < 1) 1556 return 0; 1557 1558 /* 1559 * XXXJAK: BEGIN HACK 1560 * 1561 * blist_alloc() in subr_blist.c will panic if we try to allocate 1562 * too many slots. 1563 */ 1564 if (*nslots > BLIST_MAX_ALLOC) { 1565 if (__predict_false(lessok == false)) 1566 return 0; 1567 *nslots = BLIST_MAX_ALLOC; 1568 } 1569 /* XXXJAK: END HACK */ 1570 1571 /* 1572 * lock data lock, convert slots into blocks, and enter loop 1573 */ 1574 mutex_enter(&uvm_swap_data_lock); 1575 1576 ReTry: /* XXXMRG */ 1577 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 1578 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1579 uint64_t result; 1580 1581 /* if it's not enabled, then we can't swap from it */ 1582 if ((sdp->swd_flags & SWF_ENABLE) == 0) 1583 continue; 1584 if (sdp->swd_npginuse + *nslots > sdp->swd_npages) 1585 continue; 1586 result = blist_alloc(sdp->swd_blist, *nslots); 1587 if (result == BLIST_NONE) { 1588 continue; 1589 } 1590 KASSERT(result < sdp->swd_drumsize); 1591 1592 /* 1593 * successful allocation! now rotate the tailq. 1594 */ 1595 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1596 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 1597 sdp->swd_npginuse += *nslots; 1598 uvmexp.swpginuse += *nslots; 1599 mutex_exit(&uvm_swap_data_lock); 1600 /* done! return drum slot number */ 1601 UVMHIST_LOG(pdhist, 1602 "success! returning %jd slots starting at %jd", 1603 *nslots, result + sdp->swd_drumoffset, 0, 0); 1604 return (result + sdp->swd_drumoffset); 1605 } 1606 } 1607 1608 /* XXXMRG: BEGIN HACK */ 1609 if (*nslots > 1 && lessok) { 1610 *nslots = 1; 1611 /* XXXMRG: ugh! blist should support this for us */ 1612 goto ReTry; 1613 } 1614 /* XXXMRG: END HACK */ 1615 1616 mutex_exit(&uvm_swap_data_lock); 1617 return 0; 1618 } 1619 1620 /* 1621 * uvm_swapisfull: return true if most of available swap is allocated 1622 * and in use. we don't count some small portion as it may be inaccessible 1623 * to us at any given moment, for example if there is lock contention or if 1624 * pages are busy. 1625 */ 1626 bool 1627 uvm_swapisfull(void) 1628 { 1629 int swpgonly; 1630 bool rv; 1631 1632 mutex_enter(&uvm_swap_data_lock); 1633 KASSERT(uvmexp.swpgonly <= uvmexp.swpages); 1634 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 / 1635 uvm_swapisfull_factor); 1636 rv = (swpgonly >= uvmexp.swpgavail); 1637 mutex_exit(&uvm_swap_data_lock); 1638 1639 return (rv); 1640 } 1641 1642 /* 1643 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors 1644 * 1645 * => we lock uvm_swap_data_lock 1646 */ 1647 void 1648 uvm_swap_markbad(int startslot, int nslots) 1649 { 1650 struct swapdev *sdp; 1651 UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist); 1652 1653 mutex_enter(&uvm_swap_data_lock); 1654 sdp = swapdrum_getsdp(startslot); 1655 KASSERT(sdp != NULL); 1656 1657 /* 1658 * we just keep track of how many pages have been marked bad 1659 * in this device, to make everything add up in swap_off(). 1660 * we assume here that the range of slots will all be within 1661 * one swap device. 1662 */ 1663 1664 KASSERT(uvmexp.swpgonly >= nslots); 1665 uvmexp.swpgonly -= nslots; 1666 sdp->swd_npgbad += nslots; 1667 UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0); 1668 mutex_exit(&uvm_swap_data_lock); 1669 } 1670 1671 /* 1672 * uvm_swap_free: free swap slots 1673 * 1674 * => this can be all or part of an allocation made by uvm_swap_alloc 1675 * => we lock uvm_swap_data_lock 1676 */ 1677 void 1678 uvm_swap_free(int startslot, int nslots) 1679 { 1680 struct swapdev *sdp; 1681 UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist); 1682 1683 UVMHIST_LOG(pdhist, "freeing %jd slots starting at %jd", nslots, 1684 startslot, 0, 0); 1685 1686 /* 1687 * ignore attempts to free the "bad" slot. 1688 */ 1689 1690 if (startslot == SWSLOT_BAD) { 1691 return; 1692 } 1693 1694 /* 1695 * convert drum slot offset back to sdp, free the blocks 1696 * in the extent, and return. must hold pri lock to do 1697 * lookup and access the extent. 1698 */ 1699 1700 mutex_enter(&uvm_swap_data_lock); 1701 sdp = swapdrum_getsdp(startslot); 1702 KASSERT(uvmexp.nswapdev >= 1); 1703 KASSERT(sdp != NULL); 1704 KASSERT(sdp->swd_npginuse >= nslots); 1705 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); 1706 sdp->swd_npginuse -= nslots; 1707 uvmexp.swpginuse -= nslots; 1708 mutex_exit(&uvm_swap_data_lock); 1709 } 1710 1711 /* 1712 * uvm_swap_put: put any number of pages into a contig place on swap 1713 * 1714 * => can be sync or async 1715 */ 1716 1717 int 1718 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) 1719 { 1720 int error; 1721 1722 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE | 1723 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1724 return error; 1725 } 1726 1727 /* 1728 * uvm_swap_get: get a single page from swap 1729 * 1730 * => usually a sync op (from fault) 1731 */ 1732 1733 int 1734 uvm_swap_get(struct vm_page *page, int swslot, int flags) 1735 { 1736 int error; 1737 1738 uvmexp.nswget++; 1739 KASSERT(flags & PGO_SYNCIO); 1740 if (swslot == SWSLOT_BAD) { 1741 return EIO; 1742 } 1743 1744 error = uvm_swap_io(&page, swslot, 1, B_READ | 1745 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1746 if (error == 0) { 1747 1748 /* 1749 * this page is no longer only in swap. 1750 */ 1751 1752 mutex_enter(&uvm_swap_data_lock); 1753 KASSERT(uvmexp.swpgonly > 0); 1754 uvmexp.swpgonly--; 1755 mutex_exit(&uvm_swap_data_lock); 1756 } 1757 return error; 1758 } 1759 1760 /* 1761 * uvm_swap_io: do an i/o operation to swap 1762 */ 1763 1764 static int 1765 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) 1766 { 1767 daddr_t startblk; 1768 struct buf *bp; 1769 vaddr_t kva; 1770 int error, mapinflags; 1771 bool write, async; 1772 UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist); 1773 1774 UVMHIST_LOG(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%jd", 1775 startslot, npages, flags, 0); 1776 1777 write = (flags & B_READ) == 0; 1778 async = (flags & B_ASYNC) != 0; 1779 1780 /* 1781 * allocate a buf for the i/o. 1782 */ 1783 1784 KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async)); 1785 bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp); 1786 if (bp == NULL) { 1787 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM); 1788 return ENOMEM; 1789 } 1790 1791 /* 1792 * convert starting drum slot to block number 1793 */ 1794 1795 startblk = btodb((uint64_t)startslot << PAGE_SHIFT); 1796 1797 /* 1798 * first, map the pages into the kernel. 1799 */ 1800 1801 mapinflags = !write ? 1802 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ : 1803 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE; 1804 kva = uvm_pagermapin(pps, npages, mapinflags); 1805 1806 /* 1807 * fill in the bp/sbp. we currently route our i/o through 1808 * /dev/drum's vnode [swapdev_vp]. 1809 */ 1810 1811 bp->b_cflags = BC_BUSY | BC_NOCACHE; 1812 bp->b_flags = (flags & (B_READ|B_ASYNC)); 1813 bp->b_proc = &proc0; /* XXX */ 1814 bp->b_vnbufs.le_next = NOLIST; 1815 bp->b_data = (void *)kva; 1816 bp->b_blkno = startblk; 1817 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT; 1818 1819 /* 1820 * bump v_numoutput (counter of number of active outputs). 1821 */ 1822 1823 if (write) { 1824 mutex_enter(swapdev_vp->v_interlock); 1825 swapdev_vp->v_numoutput++; 1826 mutex_exit(swapdev_vp->v_interlock); 1827 } 1828 1829 /* 1830 * for async ops we must set up the iodone handler. 1831 */ 1832 1833 if (async) { 1834 bp->b_iodone = uvm_aio_biodone; 1835 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0); 1836 if (curlwp == uvm.pagedaemon_lwp) 1837 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1838 else 1839 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 1840 } else { 1841 bp->b_iodone = NULL; 1842 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1843 } 1844 UVMHIST_LOG(pdhist, 1845 "about to start io: data = %#jx blkno = 0x%jx, bcount = %jd", 1846 (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0); 1847 1848 /* 1849 * now we start the I/O, and if async, return. 1850 */ 1851 1852 VOP_STRATEGY(swapdev_vp, bp); 1853 if (async) 1854 return 0; 1855 1856 /* 1857 * must be sync i/o. wait for it to finish 1858 */ 1859 1860 error = biowait(bp); 1861 1862 /* 1863 * kill the pager mapping 1864 */ 1865 1866 uvm_pagermapout(kva, npages); 1867 1868 /* 1869 * now dispose of the buf and we're done. 1870 */ 1871 1872 if (write) { 1873 mutex_enter(swapdev_vp->v_interlock); 1874 vwakeup(bp); 1875 mutex_exit(swapdev_vp->v_interlock); 1876 } 1877 putiobuf(bp); 1878 UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0); 1879 1880 return (error); 1881 } 1882