/*	$NetBSD: uvm_swap.c,v 1.186 2020/02/18 20:23:17 chs Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.186 2020/02/18 20:23:17 chs Exp $");

#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vmem.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files. there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority. this LIST is headed
 * by the "swap_priority" global var. each "swappri" contains a
 * TAILQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array. The actual work is done
 *	in the uvm_swap_stats() function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	dev_t			swd_dev;	/* device id */
	int			swd_flags;	/* flags:inuse/enable/fake */
	int			swd_priority;	/* our priority */
	int			swd_nblks;	/* blocks in this device */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
static struct pool vndxfer_pool, vndbuf_pool;

/*
 * local variables
 */
static vmem_t *swapmap;	/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static kmutex_t uvm_swap_data_lock __cacheline_aligned;
static krwlock_t swap_syscall_lock;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/* tuneables */
u_int uvm_swapisfull_factor = 99;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("%s: can't get vnode for swap device", __func__);
	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
		panic("%s: can't lock swap device", __func__);
	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
		panic("%s: can't open swap device", __func__);
	VOP_UNLOCK(swapdev_vp);

	/*
	 * create swap block resource map to map /dev/drum. the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space. note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
	    VM_NOSLEEP, IPL_NONE);
	if (swapmap == 0) {
		panic("%s: vmem_create failed", __func__);
	}

	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
	    NULL, IPL_BIO);
	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
	    NULL, IPL_BIO);

	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly allocated swappri structure (we will
 *	FREE it if we don't need it... this is to prevent allocation
 *	blocking here while adding swap)
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;  /* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %jd",
			    priority, 0, 0, 0);

		spp->spi_priority = priority;
		TAILQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		kmem_free(newspp, sizeof(*newspp));
	}

	/*
	 * priority found (or created). now insert on the priority's
	 * tailq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					TAILQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
		}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 */
static void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
		if (!TAILQ_EMPTY(&spp->spi_swapdev))
			continue;
		LIST_REMOVE(spp, spi_swappri);
		kmem_free(spp, sizeof(*spp));
	}
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
387 * 388 * => each swapdev takes one big contig chunk of the drum 389 * => caller must hold uvm_swap_data_lock 390 */ 391 static struct swapdev * 392 swapdrum_getsdp(int pgno) 393 { 394 struct swapdev *sdp; 395 struct swappri *spp; 396 397 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 398 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 399 if (sdp->swd_flags & SWF_FAKE) 400 continue; 401 if (pgno >= sdp->swd_drumoffset && 402 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { 403 return sdp; 404 } 405 } 406 } 407 return NULL; 408 } 409 410 void swapsys_lock(krw_t op) 411 { 412 rw_enter(&swap_syscall_lock, op); 413 } 414 415 void swapsys_unlock(void) 416 { 417 rw_exit(&swap_syscall_lock); 418 } 419 420 static void 421 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse) 422 { 423 se->se_dev = sdp->swd_dev; 424 se->se_flags = sdp->swd_flags; 425 se->se_nblks = sdp->swd_nblks; 426 se->se_inuse = inuse; 427 se->se_priority = sdp->swd_priority; 428 KASSERT(sdp->swd_pathlen < sizeof(se->se_path)); 429 strcpy(se->se_path, sdp->swd_path); 430 } 431 432 int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) = 433 (void *)enosys; 434 int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) = 435 (void *)enosys; 436 437 /* 438 * sys_swapctl: main entry point for swapctl(2) system call 439 * [with two helper functions: swap_on and swap_off] 440 */ 441 int 442 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval) 443 { 444 /* { 445 syscallarg(int) cmd; 446 syscallarg(void *) arg; 447 syscallarg(int) misc; 448 } */ 449 struct vnode *vp; 450 struct nameidata nd; 451 struct swappri *spp; 452 struct swapdev *sdp; 453 #define SWAP_PATH_MAX (PATH_MAX + 1) 454 char *userpath; 455 size_t len = 0; 456 int error; 457 int priority; 458 UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist); 459 460 /* 461 * we handle the non-priv NSWAP and STATS request first. 462 * 463 * SWAP_NSWAP: return number of config'd swap devices 464 * [can also be obtained with uvmexp sysctl] 465 */ 466 if (SCARG(uap, cmd) == SWAP_NSWAP) { 467 const int nswapdev = uvmexp.nswapdev; 468 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev, 469 0, 0, 0); 470 *retval = nswapdev; 471 return 0; 472 } 473 474 userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP); 475 476 /* 477 * ensure serialized syscall access by grabbing the swap_syscall_lock 478 */ 479 rw_enter(&swap_syscall_lock, RW_WRITER); 480 481 /* 482 * SWAP_STATS: get stats on current # of configured swap devs 483 * 484 * note that the swap_priority list can't change as long 485 * as we are holding the swap_syscall_lock. we don't want 486 * to grab the uvm_swap_data_lock because we may fault&sleep during 487 * copyout() and we don't want to be holding that lock then! 488 */ 489 switch (SCARG(uap, cmd)) { 490 case SWAP_STATS13: 491 error = (*uvm_swap_stats13)(uap, retval); 492 goto out; 493 case SWAP_STATS50: 494 error = (*uvm_swap_stats50)(uap, retval); 495 goto out; 496 case SWAP_STATS: 497 error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc), 498 NULL, sizeof(struct swapent), retval); 499 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0); 500 goto out; 501 502 case SWAP_GETDUMPDEV: 503 error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev)); 504 goto out; 505 default: 506 break; 507 } 508 509 /* 510 * all other requests require superuser privs. verify. 
511 */ 512 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL, 513 0, NULL, NULL, NULL))) 514 goto out; 515 516 if (SCARG(uap, cmd) == SWAP_DUMPOFF) { 517 /* drop the current dump device */ 518 dumpdev = NODEV; 519 dumpcdev = NODEV; 520 cpu_dumpconf(); 521 goto out; 522 } 523 524 /* 525 * at this point we expect a path name in arg. we will 526 * use namei() to gain a vnode reference (vref), and lock 527 * the vnode (VOP_LOCK). 528 * 529 * XXX: a NULL arg means use the root vnode pointer (e.g. for 530 * miniroot) 531 */ 532 if (SCARG(uap, arg) == NULL) { 533 vp = rootvp; /* miniroot */ 534 vref(vp); 535 if (vn_lock(vp, LK_EXCLUSIVE)) { 536 vrele(vp); 537 error = EBUSY; 538 goto out; 539 } 540 if (SCARG(uap, cmd) == SWAP_ON && 541 copystr("miniroot", userpath, SWAP_PATH_MAX, &len)) 542 panic("swapctl: miniroot copy failed"); 543 } else { 544 struct pathbuf *pb; 545 546 /* 547 * This used to allow copying in one extra byte 548 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON. 549 * This was completely pointless because if anyone 550 * used that extra byte namei would fail with 551 * ENAMETOOLONG anyway, so I've removed the excess 552 * logic. - dholland 20100215 553 */ 554 555 error = pathbuf_copyin(SCARG(uap, arg), &pb); 556 if (error) { 557 goto out; 558 } 559 if (SCARG(uap, cmd) == SWAP_ON) { 560 /* get a copy of the string */ 561 pathbuf_copystring(pb, userpath, SWAP_PATH_MAX); 562 len = strlen(userpath) + 1; 563 } 564 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); 565 if ((error = namei(&nd))) { 566 pathbuf_destroy(pb); 567 goto out; 568 } 569 vp = nd.ni_vp; 570 pathbuf_destroy(pb); 571 } 572 /* note: "vp" is referenced and locked */ 573 574 error = 0; /* assume no error */ 575 switch(SCARG(uap, cmd)) { 576 577 case SWAP_DUMPDEV: 578 if (vp->v_type != VBLK) { 579 error = ENOTBLK; 580 break; 581 } 582 if (bdevsw_lookup(vp->v_rdev)) { 583 dumpdev = vp->v_rdev; 584 dumpcdev = devsw_blk2chr(dumpdev); 585 } else 586 dumpdev = NODEV; 587 cpu_dumpconf(); 588 break; 589 590 case SWAP_CTL: 591 /* 592 * get new priority, remove old entry (if any) and then 593 * reinsert it in the correct place. finally, prune out 594 * any empty priority structures. 595 */ 596 priority = SCARG(uap, misc); 597 spp = kmem_alloc(sizeof(*spp), KM_SLEEP); 598 mutex_enter(&uvm_swap_data_lock); 599 if ((sdp = swaplist_find(vp, true)) == NULL) { 600 error = ENOENT; 601 } else { 602 swaplist_insert(sdp, spp, priority); 603 swaplist_trim(); 604 } 605 mutex_exit(&uvm_swap_data_lock); 606 if (error) 607 kmem_free(spp, sizeof(*spp)); 608 break; 609 610 case SWAP_ON: 611 612 /* 613 * check for duplicates. if none found, then insert a 614 * dummy entry on the list to prevent someone else from 615 * trying to enable this device while we are working on 616 * it. 617 */ 618 619 priority = SCARG(uap, misc); 620 sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP); 621 spp = kmem_alloc(sizeof(*spp), KM_SLEEP); 622 sdp->swd_flags = SWF_FAKE; 623 sdp->swd_vp = vp; 624 sdp->swd_dev = (vp->v_type == VBLK) ? 
vp->v_rdev : NODEV; 625 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK); 626 mutex_enter(&uvm_swap_data_lock); 627 if (swaplist_find(vp, false) != NULL) { 628 error = EBUSY; 629 mutex_exit(&uvm_swap_data_lock); 630 bufq_free(sdp->swd_tab); 631 kmem_free(sdp, sizeof(*sdp)); 632 kmem_free(spp, sizeof(*spp)); 633 break; 634 } 635 swaplist_insert(sdp, spp, priority); 636 mutex_exit(&uvm_swap_data_lock); 637 638 KASSERT(len > 0); 639 sdp->swd_pathlen = len; 640 sdp->swd_path = kmem_alloc(len, KM_SLEEP); 641 if (copystr(userpath, sdp->swd_path, len, 0) != 0) 642 panic("swapctl: copystr"); 643 644 /* 645 * we've now got a FAKE placeholder in the swap list. 646 * now attempt to enable swap on it. if we fail, undo 647 * what we've done and kill the fake entry we just inserted. 648 * if swap_on is a success, it will clear the SWF_FAKE flag 649 */ 650 651 if ((error = swap_on(l, sdp)) != 0) { 652 mutex_enter(&uvm_swap_data_lock); 653 (void) swaplist_find(vp, true); /* kill fake entry */ 654 swaplist_trim(); 655 mutex_exit(&uvm_swap_data_lock); 656 bufq_free(sdp->swd_tab); 657 kmem_free(sdp->swd_path, sdp->swd_pathlen); 658 kmem_free(sdp, sizeof(*sdp)); 659 break; 660 } 661 break; 662 663 case SWAP_OFF: 664 mutex_enter(&uvm_swap_data_lock); 665 if ((sdp = swaplist_find(vp, false)) == NULL) { 666 mutex_exit(&uvm_swap_data_lock); 667 error = ENXIO; 668 break; 669 } 670 671 /* 672 * If a device isn't in use or enabled, we 673 * can't stop swapping from it (again). 674 */ 675 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { 676 mutex_exit(&uvm_swap_data_lock); 677 error = EBUSY; 678 break; 679 } 680 681 /* 682 * do the real work. 683 */ 684 error = swap_off(l, sdp); 685 break; 686 687 default: 688 error = EINVAL; 689 } 690 691 /* 692 * done! release the ref gained by namei() and unlock. 693 */ 694 vput(vp); 695 out: 696 rw_exit(&swap_syscall_lock); 697 kmem_free(userpath, SWAP_PATH_MAX); 698 699 UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0); 700 return (error); 701 } 702 703 /* 704 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept 705 * away from sys_swapctl() in order to allow COMPAT_* swapctl() 706 * emulation to use it directly without going through sys_swapctl(). 707 * The problem with using sys_swapctl() there is that it involves 708 * copying the swapent array to the stackgap, and this array's size 709 * is not known at build time. Hence it would not be possible to 710 * ensure it would fit in the stackgap in any case. 
711 */ 712 int 713 uvm_swap_stats(char *ptr, int misc, 714 void (*f)(void *, const struct swapent *), size_t len, 715 register_t *retval) 716 { 717 struct swappri *spp; 718 struct swapdev *sdp; 719 struct swapent sep; 720 int count = 0; 721 int error; 722 723 KASSERT(len <= sizeof(sep)); 724 if (len == 0) 725 return ENOSYS; 726 727 if (misc < 0) 728 return EINVAL; 729 730 if (misc == 0 || uvmexp.nswapdev == 0) 731 return 0; 732 733 /* Make sure userland cannot exhaust kernel memory */ 734 if ((size_t)misc > (size_t)uvmexp.nswapdev) 735 misc = uvmexp.nswapdev; 736 737 KASSERT(rw_lock_held(&swap_syscall_lock)); 738 739 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 740 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 741 int inuse; 742 743 if (misc-- <= 0) 744 break; 745 746 inuse = btodb((uint64_t)sdp->swd_npginuse << 747 PAGE_SHIFT); 748 749 memset(&sep, 0, sizeof(sep)); 750 swapent_cvt(&sep, sdp, inuse); 751 if (f) 752 (*f)(&sep, &sep); 753 if ((error = copyout(&sep, ptr, len)) != 0) 754 return error; 755 ptr += len; 756 count++; 757 } 758 } 759 *retval = count; 760 return 0; 761 } 762 763 /* 764 * swap_on: attempt to enable a swapdev for swapping. note that the 765 * swapdev is already on the global list, but disabled (marked 766 * SWF_FAKE). 767 * 768 * => we avoid the start of the disk (to protect disk labels) 769 * => we also avoid the miniroot, if we are swapping to root. 770 * => caller should leave uvm_swap_data_lock unlocked, we may lock it 771 * if needed. 772 */ 773 static int 774 swap_on(struct lwp *l, struct swapdev *sdp) 775 { 776 struct vnode *vp; 777 int error, npages, nblocks, size; 778 long addr; 779 vmem_addr_t result; 780 struct vattr va; 781 dev_t dev; 782 UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist); 783 784 /* 785 * we want to enable swapping on sdp. the swd_vp contains 786 * the vnode we want (locked and ref'd), and the swd_dev 787 * contains the dev_t of the file, if it a block device. 788 */ 789 790 vp = sdp->swd_vp; 791 dev = sdp->swd_dev; 792 793 /* 794 * open the swap file (mostly useful for block device files to 795 * let device driver know what is up). 796 * 797 * we skip the open/close for root on swap because the root 798 * has already been opened when root was mounted (mountroot). 799 */ 800 if (vp != rootvp) { 801 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred))) 802 return (error); 803 } 804 805 /* XXX this only works for block devices */ 806 UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0); 807 808 /* 809 * we now need to determine the size of the swap area. for 810 * block specials we can call the d_psize function. 811 * for normal files, we must stat [get attrs]. 812 * 813 * we put the result in nblks. 814 * for normal files, we also want the filesystem block size 815 * (which we get with statfs). 816 */ 817 switch (vp->v_type) { 818 case VBLK: 819 if ((nblocks = bdev_size(dev)) == -1) { 820 error = ENXIO; 821 goto bad; 822 } 823 break; 824 825 case VREG: 826 if ((error = VOP_GETATTR(vp, &va, l->l_cred))) 827 goto bad; 828 nblocks = (int)btodb(va.va_size); 829 sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift; 830 /* 831 * limit the max # of outstanding I/O requests we issue 832 * at any one time. take it easy on NFS servers. 833 */ 834 if (vp->v_tag == VT_NFS) 835 sdp->swd_maxactive = 2; /* XXX */ 836 else 837 sdp->swd_maxactive = 8; /* XXX */ 838 break; 839 840 default: 841 error = ENXIO; 842 goto bad; 843 } 844 845 /* 846 * save nblocks in a safe place and convert to pages. 
847 */ 848 849 sdp->swd_nblks = nblocks; 850 npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT; 851 852 /* 853 * for block special files, we want to make sure that leave 854 * the disklabel and bootblocks alone, so we arrange to skip 855 * over them (arbitrarily choosing to skip PAGE_SIZE bytes). 856 * note that because of this the "size" can be less than the 857 * actual number of blocks on the device. 858 */ 859 if (vp->v_type == VBLK) { 860 /* we use pages 1 to (size - 1) [inclusive] */ 861 size = npages - 1; 862 addr = 1; 863 } else { 864 /* we use pages 0 to (size - 1) [inclusive] */ 865 size = npages; 866 addr = 0; 867 } 868 869 /* 870 * make sure we have enough blocks for a reasonable sized swap 871 * area. we want at least one page. 872 */ 873 874 if (size < 1) { 875 UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0); 876 error = EINVAL; 877 goto bad; 878 } 879 880 UVMHIST_LOG(pdhist, " dev=%jx: size=%jd addr=%jd", dev, size, addr, 0); 881 882 /* 883 * now we need to allocate an extent to manage this swap device 884 */ 885 886 sdp->swd_blist = blist_create(npages); 887 /* mark all expect the `saved' region free. */ 888 blist_free(sdp->swd_blist, addr, size); 889 890 /* 891 * if the vnode we are swapping to is the root vnode 892 * (i.e. we are swapping to the miniroot) then we want 893 * to make sure we don't overwrite it. do a statfs to 894 * find its size and skip over it. 895 */ 896 if (vp == rootvp) { 897 struct mount *mp; 898 struct statvfs *sp; 899 int rootblocks, rootpages; 900 901 mp = rootvnode->v_mount; 902 sp = &mp->mnt_stat; 903 rootblocks = sp->f_blocks * btodb(sp->f_frsize); 904 /* 905 * XXX: sp->f_blocks isn't the total number of 906 * blocks in the filesystem, it's the number of 907 * data blocks. so, our rootblocks almost 908 * definitely underestimates the total size 909 * of the filesystem - how badly depends on the 910 * details of the filesystem type. there isn't 911 * an obvious way to deal with this cleanly 912 * and perfectly, so for now we just pad our 913 * rootblocks estimate with an extra 5 percent. 914 */ 915 rootblocks += (rootblocks >> 5) + 916 (rootblocks >> 6) + 917 (rootblocks >> 7); 918 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT; 919 if (rootpages > size) 920 panic("swap_on: miniroot larger than swap?"); 921 922 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) { 923 panic("swap_on: unable to preserve miniroot"); 924 } 925 926 size -= rootpages; 927 printf("Preserved %d pages of miniroot ", rootpages); 928 printf("leaving %d pages of swap\n", size); 929 } 930 931 /* 932 * add a ref to vp to reflect usage as a swap device. 933 */ 934 vref(vp); 935 936 /* 937 * now add the new swapdev to the drum and enable. 938 */ 939 error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result); 940 if (error != 0) 941 panic("swapdrum_add"); 942 /* 943 * If this is the first regular swap create the workqueue. 944 * => Protected by swap_syscall_lock. 
945 */ 946 if (vp->v_type != VBLK) { 947 if (sw_reg_count++ == 0) { 948 KASSERT(sw_reg_workqueue == NULL); 949 if (workqueue_create(&sw_reg_workqueue, "swapiod", 950 sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0) 951 panic("%s: workqueue_create failed", __func__); 952 } 953 } 954 955 sdp->swd_drumoffset = (int)result; 956 sdp->swd_drumsize = npages; 957 sdp->swd_npages = size; 958 mutex_enter(&uvm_swap_data_lock); 959 sdp->swd_flags &= ~SWF_FAKE; /* going live */ 960 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); 961 uvmexp.swpages += size; 962 uvmexp.swpgavail += size; 963 mutex_exit(&uvm_swap_data_lock); 964 return (0); 965 966 /* 967 * failure: clean up and return error. 968 */ 969 970 bad: 971 if (sdp->swd_blist) { 972 blist_destroy(sdp->swd_blist); 973 } 974 if (vp != rootvp) { 975 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred); 976 } 977 return (error); 978 } 979 980 /* 981 * swap_off: stop swapping on swapdev 982 * 983 * => swap data should be locked, we will unlock. 984 */ 985 static int 986 swap_off(struct lwp *l, struct swapdev *sdp) 987 { 988 int npages = sdp->swd_npages; 989 int error = 0; 990 991 UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist); 992 UVMHIST_LOG(pdhist, " dev=%jx, npages=%jd", sdp->swd_dev,npages, 0, 0); 993 994 /* disable the swap area being removed */ 995 sdp->swd_flags &= ~SWF_ENABLE; 996 uvmexp.swpgavail -= npages; 997 mutex_exit(&uvm_swap_data_lock); 998 999 /* 1000 * the idea is to find all the pages that are paged out to this 1001 * device, and page them all in. in uvm, swap-backed pageable 1002 * memory can take two forms: aobjs and anons. call the 1003 * swapoff hook for each subsystem to bring in pages. 1004 */ 1005 1006 if (uao_swap_off(sdp->swd_drumoffset, 1007 sdp->swd_drumoffset + sdp->swd_drumsize) || 1008 amap_swap_off(sdp->swd_drumoffset, 1009 sdp->swd_drumoffset + sdp->swd_drumsize)) { 1010 error = ENOMEM; 1011 } else if (sdp->swd_npginuse > sdp->swd_npgbad) { 1012 error = EBUSY; 1013 } 1014 1015 if (error) { 1016 mutex_enter(&uvm_swap_data_lock); 1017 sdp->swd_flags |= SWF_ENABLE; 1018 uvmexp.swpgavail += npages; 1019 mutex_exit(&uvm_swap_data_lock); 1020 1021 return error; 1022 } 1023 1024 /* 1025 * If this is the last regular swap destroy the workqueue. 1026 * => Protected by swap_syscall_lock. 1027 */ 1028 if (sdp->swd_vp->v_type != VBLK) { 1029 KASSERT(sw_reg_count > 0); 1030 KASSERT(sw_reg_workqueue != NULL); 1031 if (--sw_reg_count == 0) { 1032 workqueue_destroy(sw_reg_workqueue); 1033 sw_reg_workqueue = NULL; 1034 } 1035 } 1036 1037 /* 1038 * done with the vnode. 1039 * drop our ref on the vnode before calling VOP_CLOSE() 1040 * so that spec_close() can tell if this is the last close. 1041 */ 1042 vrele(sdp->swd_vp); 1043 if (sdp->swd_vp != rootvp) { 1044 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred); 1045 } 1046 1047 mutex_enter(&uvm_swap_data_lock); 1048 uvmexp.swpages -= npages; 1049 uvmexp.swpginuse -= sdp->swd_npgbad; 1050 1051 if (swaplist_find(sdp->swd_vp, true) == NULL) 1052 panic("%s: swapdev not in list", __func__); 1053 swaplist_trim(); 1054 mutex_exit(&uvm_swap_data_lock); 1055 1056 /* 1057 * free all resources! 
1058 */ 1059 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize); 1060 blist_destroy(sdp->swd_blist); 1061 bufq_free(sdp->swd_tab); 1062 kmem_free(sdp, sizeof(*sdp)); 1063 return (0); 1064 } 1065 1066 void 1067 uvm_swap_shutdown(struct lwp *l) 1068 { 1069 struct swapdev *sdp; 1070 struct swappri *spp; 1071 struct vnode *vp; 1072 int error; 1073 1074 printf("turning off swap..."); 1075 rw_enter(&swap_syscall_lock, RW_WRITER); 1076 mutex_enter(&uvm_swap_data_lock); 1077 again: 1078 LIST_FOREACH(spp, &swap_priority, spi_swappri) 1079 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1080 if (sdp->swd_flags & SWF_FAKE) 1081 continue; 1082 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) 1083 continue; 1084 #ifdef DEBUG 1085 printf("\nturning off swap on %s...", 1086 sdp->swd_path); 1087 #endif 1088 if (vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE)) { 1089 error = EBUSY; 1090 vp = NULL; 1091 } else 1092 error = 0; 1093 if (!error) { 1094 error = swap_off(l, sdp); 1095 mutex_enter(&uvm_swap_data_lock); 1096 } 1097 if (error) { 1098 printf("stopping swap on %s failed " 1099 "with error %d\n", sdp->swd_path, error); 1100 TAILQ_REMOVE(&spp->spi_swapdev, sdp, 1101 swd_next); 1102 uvmexp.nswapdev--; 1103 swaplist_trim(); 1104 if (vp) 1105 vput(vp); 1106 } 1107 goto again; 1108 } 1109 printf(" done\n"); 1110 mutex_exit(&uvm_swap_data_lock); 1111 rw_exit(&swap_syscall_lock); 1112 } 1113 1114 1115 /* 1116 * /dev/drum interface and i/o functions 1117 */ 1118 1119 /* 1120 * swstrategy: perform I/O on the drum 1121 * 1122 * => we must map the i/o request from the drum to the correct swapdev. 1123 */ 1124 static void 1125 swstrategy(struct buf *bp) 1126 { 1127 struct swapdev *sdp; 1128 struct vnode *vp; 1129 int pageno, bn; 1130 UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist); 1131 1132 /* 1133 * convert block number to swapdev. note that swapdev can't 1134 * be yanked out from under us because we are holding resources 1135 * in it (i.e. the blocks we are doing I/O on). 1136 */ 1137 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; 1138 mutex_enter(&uvm_swap_data_lock); 1139 sdp = swapdrum_getsdp(pageno); 1140 mutex_exit(&uvm_swap_data_lock); 1141 if (sdp == NULL) { 1142 bp->b_error = EINVAL; 1143 bp->b_resid = bp->b_bcount; 1144 biodone(bp); 1145 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0); 1146 return; 1147 } 1148 1149 /* 1150 * convert drum page number to block number on this swapdev. 1151 */ 1152 1153 pageno -= sdp->swd_drumoffset; /* page # on swapdev */ 1154 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ 1155 1156 UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%jx bn=%jx bcount=%jd", 1157 ((bp->b_flags & B_READ) == 0) ? 1 : 0, 1158 sdp->swd_drumoffset, bn, bp->b_bcount); 1159 1160 /* 1161 * for block devices we finish up here. 1162 * for regular files we have to do more work which we delegate 1163 * to sw_reg_strategy(). 1164 */ 1165 1166 vp = sdp->swd_vp; /* swapdev vnode pointer */ 1167 switch (vp->v_type) { 1168 default: 1169 panic("%s: vnode type 0x%x", __func__, vp->v_type); 1170 1171 case VBLK: 1172 1173 /* 1174 * must convert "bp" from an I/O on /dev/drum to an I/O 1175 * on the swapdev (sdp). 1176 */ 1177 bp->b_blkno = bn; /* swapdev block number */ 1178 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */ 1179 1180 /* 1181 * if we are doing a write, we have to redirect the i/o on 1182 * drum's v_numoutput counter to the swapdevs. 
1183 */ 1184 if ((bp->b_flags & B_READ) == 0) { 1185 mutex_enter(bp->b_objlock); 1186 vwakeup(bp); /* kills one 'v_numoutput' on drum */ 1187 mutex_exit(bp->b_objlock); 1188 mutex_enter(vp->v_interlock); 1189 vp->v_numoutput++; /* put it on swapdev */ 1190 mutex_exit(vp->v_interlock); 1191 } 1192 1193 /* 1194 * finally plug in swapdev vnode and start I/O 1195 */ 1196 bp->b_vp = vp; 1197 bp->b_objlock = vp->v_interlock; 1198 VOP_STRATEGY(vp, bp); 1199 return; 1200 1201 case VREG: 1202 /* 1203 * delegate to sw_reg_strategy function. 1204 */ 1205 sw_reg_strategy(sdp, bp, bn); 1206 return; 1207 } 1208 /* NOTREACHED */ 1209 } 1210 1211 /* 1212 * swread: the read function for the drum (just a call to physio) 1213 */ 1214 /*ARGSUSED*/ 1215 static int 1216 swread(dev_t dev, struct uio *uio, int ioflag) 1217 { 1218 UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist); 1219 1220 UVMHIST_LOG(pdhist, " dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0); 1221 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio)); 1222 } 1223 1224 /* 1225 * swwrite: the write function for the drum (just a call to physio) 1226 */ 1227 /*ARGSUSED*/ 1228 static int 1229 swwrite(dev_t dev, struct uio *uio, int ioflag) 1230 { 1231 UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist); 1232 1233 UVMHIST_LOG(pdhist, " dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0); 1234 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio)); 1235 } 1236 1237 const struct bdevsw swap_bdevsw = { 1238 .d_open = nullopen, 1239 .d_close = nullclose, 1240 .d_strategy = swstrategy, 1241 .d_ioctl = noioctl, 1242 .d_dump = nodump, 1243 .d_psize = nosize, 1244 .d_discard = nodiscard, 1245 .d_flag = D_OTHER 1246 }; 1247 1248 const struct cdevsw swap_cdevsw = { 1249 .d_open = nullopen, 1250 .d_close = nullclose, 1251 .d_read = swread, 1252 .d_write = swwrite, 1253 .d_ioctl = noioctl, 1254 .d_stop = nostop, 1255 .d_tty = notty, 1256 .d_poll = nopoll, 1257 .d_mmap = nommap, 1258 .d_kqfilter = nokqfilter, 1259 .d_discard = nodiscard, 1260 .d_flag = D_OTHER, 1261 }; 1262 1263 /* 1264 * sw_reg_strategy: handle swap i/o to regular files 1265 */ 1266 static void 1267 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) 1268 { 1269 struct vnode *vp; 1270 struct vndxfer *vnx; 1271 daddr_t nbn; 1272 char *addr; 1273 off_t byteoff; 1274 int s, off, nra, error, sz, resid; 1275 UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist); 1276 1277 /* 1278 * allocate a vndxfer head for this transfer and point it to 1279 * our buffer. 1280 */ 1281 vnx = pool_get(&vndxfer_pool, PR_WAITOK); 1282 vnx->vx_flags = VX_BUSY; 1283 vnx->vx_error = 0; 1284 vnx->vx_pending = 0; 1285 vnx->vx_bp = bp; 1286 vnx->vx_sdp = sdp; 1287 1288 /* 1289 * setup for main loop where we read filesystem blocks into 1290 * our buffer. 1291 */ 1292 error = 0; 1293 bp->b_resid = bp->b_bcount; /* nothing transferred yet! */ 1294 addr = bp->b_data; /* current position in buffer */ 1295 byteoff = dbtob((uint64_t)bn); 1296 1297 for (resid = bp->b_resid; resid; resid -= sz) { 1298 struct vndbuf *nbp; 1299 1300 /* 1301 * translate byteoffset into block number. return values: 1302 * vp = vnode of underlying device 1303 * nbn = new block number (on underlying vnode dev) 1304 * nra = num blocks we can read-ahead (excludes requested 1305 * block) 1306 */ 1307 nra = 0; 1308 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, 1309 &vp, &nbn, &nra); 1310 1311 if (error == 0 && nbn == (daddr_t)-1) { 1312 /* 1313 * this used to just set error, but that doesn't 1314 * do the right thing. 
Instead, it causes random 1315 * memory errors. The panic() should remain until 1316 * this condition doesn't destabilize the system. 1317 */ 1318 #if 1 1319 panic("%s: swap to sparse file", __func__); 1320 #else 1321 error = EIO; /* failure */ 1322 #endif 1323 } 1324 1325 /* 1326 * punt if there was an error or a hole in the file. 1327 * we must wait for any i/o ops we have already started 1328 * to finish before returning. 1329 * 1330 * XXX we could deal with holes here but it would be 1331 * a hassle (in the write case). 1332 */ 1333 if (error) { 1334 s = splbio(); 1335 vnx->vx_error = error; /* pass error up */ 1336 goto out; 1337 } 1338 1339 /* 1340 * compute the size ("sz") of this transfer (in bytes). 1341 */ 1342 off = byteoff % sdp->swd_bsize; 1343 sz = (1 + nra) * sdp->swd_bsize - off; 1344 if (sz > resid) 1345 sz = resid; 1346 1347 UVMHIST_LOG(pdhist, "sw_reg_strategy: " 1348 "vp %#jx/%#jx offset 0x%jx/0x%jx", 1349 (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn); 1350 1351 /* 1352 * now get a buf structure. note that the vb_buf is 1353 * at the front of the nbp structure so that you can 1354 * cast pointers between the two structure easily. 1355 */ 1356 nbp = pool_get(&vndbuf_pool, PR_WAITOK); 1357 buf_init(&nbp->vb_buf); 1358 nbp->vb_buf.b_flags = bp->b_flags; 1359 nbp->vb_buf.b_cflags = bp->b_cflags; 1360 nbp->vb_buf.b_oflags = bp->b_oflags; 1361 nbp->vb_buf.b_bcount = sz; 1362 nbp->vb_buf.b_bufsize = sz; 1363 nbp->vb_buf.b_error = 0; 1364 nbp->vb_buf.b_data = addr; 1365 nbp->vb_buf.b_lblkno = 0; 1366 nbp->vb_buf.b_blkno = nbn + btodb(off); 1367 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno; 1368 nbp->vb_buf.b_iodone = sw_reg_biodone; 1369 nbp->vb_buf.b_vp = vp; 1370 nbp->vb_buf.b_objlock = vp->v_interlock; 1371 if (vp->v_type == VBLK) { 1372 nbp->vb_buf.b_dev = vp->v_rdev; 1373 } 1374 1375 nbp->vb_xfer = vnx; /* patch it back in to vnx */ 1376 1377 /* 1378 * Just sort by block number 1379 */ 1380 s = splbio(); 1381 if (vnx->vx_error != 0) { 1382 buf_destroy(&nbp->vb_buf); 1383 pool_put(&vndbuf_pool, nbp); 1384 goto out; 1385 } 1386 vnx->vx_pending++; 1387 1388 /* sort it in and start I/O if we are not over our limit */ 1389 /* XXXAD locking */ 1390 bufq_put(sdp->swd_tab, &nbp->vb_buf); 1391 sw_reg_start(sdp); 1392 splx(s); 1393 1394 /* 1395 * advance to the next I/O 1396 */ 1397 byteoff += sz; 1398 addr += sz; 1399 } 1400 1401 s = splbio(); 1402 1403 out: /* Arrive here at splbio */ 1404 vnx->vx_flags &= ~VX_BUSY; 1405 if (vnx->vx_pending == 0) { 1406 error = vnx->vx_error; 1407 pool_put(&vndxfer_pool, vnx); 1408 bp->b_error = error; 1409 biodone(bp); 1410 } 1411 splx(s); 1412 } 1413 1414 /* 1415 * sw_reg_start: start an I/O request on the requested swapdev 1416 * 1417 * => reqs are sorted by b_rawblkno (above) 1418 */ 1419 static void 1420 sw_reg_start(struct swapdev *sdp) 1421 { 1422 struct buf *bp; 1423 struct vnode *vp; 1424 UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist); 1425 1426 /* recursion control */ 1427 if ((sdp->swd_flags & SWF_BUSY) != 0) 1428 return; 1429 1430 sdp->swd_flags |= SWF_BUSY; 1431 1432 while (sdp->swd_active < sdp->swd_maxactive) { 1433 bp = bufq_get(sdp->swd_tab); 1434 if (bp == NULL) 1435 break; 1436 sdp->swd_active++; 1437 1438 UVMHIST_LOG(pdhist, 1439 "sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %jx", 1440 (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno, 1441 bp->b_bcount); 1442 vp = bp->b_vp; 1443 KASSERT(bp->b_objlock == vp->v_interlock); 1444 if ((bp->b_flags & B_READ) == 0) { 1445 mutex_enter(vp->v_interlock); 1446 
vp->v_numoutput++; 1447 mutex_exit(vp->v_interlock); 1448 } 1449 VOP_STRATEGY(vp, bp); 1450 } 1451 sdp->swd_flags &= ~SWF_BUSY; 1452 } 1453 1454 /* 1455 * sw_reg_biodone: one of our i/o's has completed 1456 */ 1457 static void 1458 sw_reg_biodone(struct buf *bp) 1459 { 1460 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL); 1461 } 1462 1463 /* 1464 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup 1465 * 1466 * => note that we can recover the vndbuf struct by casting the buf ptr 1467 */ 1468 static void 1469 sw_reg_iodone(struct work *wk, void *dummy) 1470 { 1471 struct vndbuf *vbp = (void *)wk; 1472 struct vndxfer *vnx = vbp->vb_xfer; 1473 struct buf *pbp = vnx->vx_bp; /* parent buffer */ 1474 struct swapdev *sdp = vnx->vx_sdp; 1475 int s, resid, error; 1476 KASSERT(&vbp->vb_buf.b_work == wk); 1477 UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist); 1478 1479 UVMHIST_LOG(pdhist, " vbp=%#jx vp=%#jx blkno=%jx addr=%#jx", 1480 (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, 1481 (uintptr_t)vbp->vb_buf.b_data); 1482 UVMHIST_LOG(pdhist, " cnt=%jx resid=%jx", 1483 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0); 1484 1485 /* 1486 * protect vbp at splbio and update. 1487 */ 1488 1489 s = splbio(); 1490 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; 1491 pbp->b_resid -= resid; 1492 vnx->vx_pending--; 1493 1494 if (vbp->vb_buf.b_error != 0) { 1495 /* pass error upward */ 1496 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO; 1497 UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0); 1498 vnx->vx_error = error; 1499 } 1500 1501 /* 1502 * kill vbp structure 1503 */ 1504 buf_destroy(&vbp->vb_buf); 1505 pool_put(&vndbuf_pool, vbp); 1506 1507 /* 1508 * wrap up this transaction if it has run to completion or, in 1509 * case of an error, when all auxiliary buffers have returned. 1510 */ 1511 if (vnx->vx_error != 0) { 1512 /* pass error upward */ 1513 error = vnx->vx_error; 1514 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { 1515 pbp->b_error = error; 1516 biodone(pbp); 1517 pool_put(&vndxfer_pool, vnx); 1518 } 1519 } else if (pbp->b_resid == 0) { 1520 KASSERT(vnx->vx_pending == 0); 1521 if ((vnx->vx_flags & VX_BUSY) == 0) { 1522 UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !", 1523 (uintptr_t)pbp, vnx->vx_error, 0, 0); 1524 biodone(pbp); 1525 pool_put(&vndxfer_pool, vnx); 1526 } 1527 } 1528 1529 /* 1530 * done! start next swapdev I/O if one is pending 1531 */ 1532 sdp->swd_active--; 1533 sw_reg_start(sdp); 1534 splx(s); 1535 } 1536 1537 1538 /* 1539 * uvm_swap_alloc: allocate space on swap 1540 * 1541 * => allocation is done "round robin" down the priority list, as we 1542 * allocate in a priority we "rotate" the circle queue. 1543 * => space can be freed with uvm_swap_free 1544 * => we return the page slot number in /dev/drum (0 == invalid slot) 1545 * => we lock uvm_swap_data_lock 1546 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM 1547 */ 1548 int 1549 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok) 1550 { 1551 struct swapdev *sdp; 1552 struct swappri *spp; 1553 UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist); 1554 1555 /* 1556 * no swap devices configured yet? definite failure. 1557 */ 1558 if (uvmexp.nswapdev < 1) 1559 return 0; 1560 1561 /* 1562 * XXXJAK: BEGIN HACK 1563 * 1564 * blist_alloc() in subr_blist.c will panic if we try to allocate 1565 * too many slots. 
1566 */ 1567 if (*nslots > BLIST_MAX_ALLOC) { 1568 if (__predict_false(lessok == false)) 1569 return 0; 1570 *nslots = BLIST_MAX_ALLOC; 1571 } 1572 /* XXXJAK: END HACK */ 1573 1574 /* 1575 * lock data lock, convert slots into blocks, and enter loop 1576 */ 1577 mutex_enter(&uvm_swap_data_lock); 1578 1579 ReTry: /* XXXMRG */ 1580 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 1581 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1582 uint64_t result; 1583 1584 /* if it's not enabled, then we can't swap from it */ 1585 if ((sdp->swd_flags & SWF_ENABLE) == 0) 1586 continue; 1587 if (sdp->swd_npginuse + *nslots > sdp->swd_npages) 1588 continue; 1589 result = blist_alloc(sdp->swd_blist, *nslots); 1590 if (result == BLIST_NONE) { 1591 continue; 1592 } 1593 KASSERT(result < sdp->swd_drumsize); 1594 1595 /* 1596 * successful allocation! now rotate the tailq. 1597 */ 1598 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1599 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 1600 sdp->swd_npginuse += *nslots; 1601 uvmexp.swpginuse += *nslots; 1602 mutex_exit(&uvm_swap_data_lock); 1603 /* done! return drum slot number */ 1604 UVMHIST_LOG(pdhist, 1605 "success! returning %jd slots starting at %jd", 1606 *nslots, result + sdp->swd_drumoffset, 0, 0); 1607 return (result + sdp->swd_drumoffset); 1608 } 1609 } 1610 1611 /* XXXMRG: BEGIN HACK */ 1612 if (*nslots > 1 && lessok) { 1613 *nslots = 1; 1614 /* XXXMRG: ugh! blist should support this for us */ 1615 goto ReTry; 1616 } 1617 /* XXXMRG: END HACK */ 1618 1619 mutex_exit(&uvm_swap_data_lock); 1620 return 0; 1621 } 1622 1623 /* 1624 * uvm_swapisfull: return true if most of available swap is allocated 1625 * and in use. we don't count some small portion as it may be inaccessible 1626 * to us at any given moment, for example if there is lock contention or if 1627 * pages are busy. 1628 */ 1629 bool 1630 uvm_swapisfull(void) 1631 { 1632 int swpgonly; 1633 bool rv; 1634 1635 mutex_enter(&uvm_swap_data_lock); 1636 KASSERT(uvmexp.swpgonly <= uvmexp.swpages); 1637 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 / 1638 uvm_swapisfull_factor); 1639 rv = (swpgonly >= uvmexp.swpgavail); 1640 mutex_exit(&uvm_swap_data_lock); 1641 1642 return (rv); 1643 } 1644 1645 /* 1646 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors 1647 * 1648 * => we lock uvm_swap_data_lock 1649 */ 1650 void 1651 uvm_swap_markbad(int startslot, int nslots) 1652 { 1653 struct swapdev *sdp; 1654 UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist); 1655 1656 mutex_enter(&uvm_swap_data_lock); 1657 sdp = swapdrum_getsdp(startslot); 1658 KASSERT(sdp != NULL); 1659 1660 /* 1661 * we just keep track of how many pages have been marked bad 1662 * in this device, to make everything add up in swap_off(). 1663 * we assume here that the range of slots will all be within 1664 * one swap device. 
1665 */ 1666 1667 KASSERT(uvmexp.swpgonly >= nslots); 1668 atomic_add_int(&uvmexp.swpgonly, -nslots); 1669 sdp->swd_npgbad += nslots; 1670 UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0); 1671 mutex_exit(&uvm_swap_data_lock); 1672 } 1673 1674 /* 1675 * uvm_swap_free: free swap slots 1676 * 1677 * => this can be all or part of an allocation made by uvm_swap_alloc 1678 * => we lock uvm_swap_data_lock 1679 */ 1680 void 1681 uvm_swap_free(int startslot, int nslots) 1682 { 1683 struct swapdev *sdp; 1684 UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist); 1685 1686 UVMHIST_LOG(pdhist, "freeing %jd slots starting at %jd", nslots, 1687 startslot, 0, 0); 1688 1689 /* 1690 * ignore attempts to free the "bad" slot. 1691 */ 1692 1693 if (startslot == SWSLOT_BAD) { 1694 return; 1695 } 1696 1697 /* 1698 * convert drum slot offset back to sdp, free the blocks 1699 * in the extent, and return. must hold pri lock to do 1700 * lookup and access the extent. 1701 */ 1702 1703 mutex_enter(&uvm_swap_data_lock); 1704 sdp = swapdrum_getsdp(startslot); 1705 KASSERT(uvmexp.nswapdev >= 1); 1706 KASSERT(sdp != NULL); 1707 KASSERT(sdp->swd_npginuse >= nslots); 1708 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); 1709 sdp->swd_npginuse -= nslots; 1710 uvmexp.swpginuse -= nslots; 1711 mutex_exit(&uvm_swap_data_lock); 1712 } 1713 1714 /* 1715 * uvm_swap_put: put any number of pages into a contig place on swap 1716 * 1717 * => can be sync or async 1718 */ 1719 1720 int 1721 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) 1722 { 1723 int error; 1724 1725 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE | 1726 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1727 return error; 1728 } 1729 1730 /* 1731 * uvm_swap_get: get a single page from swap 1732 * 1733 * => usually a sync op (from fault) 1734 */ 1735 1736 int 1737 uvm_swap_get(struct vm_page *page, int swslot, int flags) 1738 { 1739 int error; 1740 1741 atomic_inc_uint(&uvmexp.nswget); 1742 KASSERT(flags & PGO_SYNCIO); 1743 if (swslot == SWSLOT_BAD) { 1744 return EIO; 1745 } 1746 1747 error = uvm_swap_io(&page, swslot, 1, B_READ | 1748 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1749 if (error == 0) { 1750 1751 /* 1752 * this page is no longer only in swap. 1753 */ 1754 1755 KASSERT(uvmexp.swpgonly > 0); 1756 atomic_dec_uint(&uvmexp.swpgonly); 1757 } 1758 return error; 1759 } 1760 1761 /* 1762 * uvm_swap_io: do an i/o operation to swap 1763 */ 1764 1765 static int 1766 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) 1767 { 1768 daddr_t startblk; 1769 struct buf *bp; 1770 vaddr_t kva; 1771 int error, mapinflags; 1772 bool write, async; 1773 UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist); 1774 1775 UVMHIST_LOG(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%jd", 1776 startslot, npages, flags, 0); 1777 1778 write = (flags & B_READ) == 0; 1779 async = (flags & B_ASYNC) != 0; 1780 1781 /* 1782 * allocate a buf for the i/o. 1783 */ 1784 1785 KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async)); 1786 bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp); 1787 if (bp == NULL) { 1788 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM); 1789 return ENOMEM; 1790 } 1791 1792 /* 1793 * convert starting drum slot to block number 1794 */ 1795 1796 startblk = btodb((uint64_t)startslot << PAGE_SHIFT); 1797 1798 /* 1799 * first, map the pages into the kernel. 1800 */ 1801 1802 mapinflags = !write ? 
1803 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ : 1804 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE; 1805 kva = uvm_pagermapin(pps, npages, mapinflags); 1806 1807 /* 1808 * fill in the bp/sbp. we currently route our i/o through 1809 * /dev/drum's vnode [swapdev_vp]. 1810 */ 1811 1812 bp->b_cflags = BC_BUSY | BC_NOCACHE; 1813 bp->b_flags = (flags & (B_READ|B_ASYNC)); 1814 bp->b_proc = &proc0; /* XXX */ 1815 bp->b_vnbufs.le_next = NOLIST; 1816 bp->b_data = (void *)kva; 1817 bp->b_blkno = startblk; 1818 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT; 1819 1820 /* 1821 * bump v_numoutput (counter of number of active outputs). 1822 */ 1823 1824 if (write) { 1825 mutex_enter(swapdev_vp->v_interlock); 1826 swapdev_vp->v_numoutput++; 1827 mutex_exit(swapdev_vp->v_interlock); 1828 } 1829 1830 /* 1831 * for async ops we must set up the iodone handler. 1832 */ 1833 1834 if (async) { 1835 bp->b_iodone = uvm_aio_aiodone; 1836 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0); 1837 if (curlwp == uvm.pagedaemon_lwp) 1838 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1839 else 1840 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 1841 } else { 1842 bp->b_iodone = NULL; 1843 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1844 } 1845 UVMHIST_LOG(pdhist, 1846 "about to start io: data = %#jx blkno = 0x%jx, bcount = %jd", 1847 (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0); 1848 1849 /* 1850 * now we start the I/O, and if async, return. 1851 */ 1852 1853 VOP_STRATEGY(swapdev_vp, bp); 1854 if (async) 1855 return 0; 1856 1857 /* 1858 * must be sync i/o. wait for it to finish 1859 */ 1860 1861 error = biowait(bp); 1862 1863 /* 1864 * kill the pager mapping 1865 */ 1866 1867 uvm_pagermapout(kva, npages); 1868 1869 /* 1870 * now dispose of the buf and we're done. 1871 */ 1872 1873 if (write) { 1874 mutex_enter(swapdev_vp->v_interlock); 1875 vwakeup(bp); 1876 mutex_exit(swapdev_vp->v_interlock); 1877 } 1878 putiobuf(bp); 1879 UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0); 1880 1881 return (error); 1882 } 1883
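/*
 * Illustrative userland sketch (guarded out with #if 0, not part of this
 * kernel source): one plausible way the swapctl(2) interface documented at
 * the top of this file is consumed.  SWAP_NSWAP reports the number of
 * configured swap devices and SWAP_STATS fills an array of struct swapent,
 * one entry per device, as produced by swapent_cvt() above.  The headers
 * and error handling shown are assumptions for a standalone userland
 * program, not something this file provides.
 */
#if 0
#include <sys/swap.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	struct swapent *sep;
	int i, n, nswap;

	/* SWAP_NSWAP ignores "arg" and "misc"; it just returns a count */
	nswap = swapctl(SWAP_NSWAP, NULL, 0);
	if (nswap <= 0)
		return 0;

	sep = calloc(nswap, sizeof(*sep));
	if (sep == NULL)
		return 1;

	/* SWAP_STATS copies out up to "misc" entries and returns how many */
	n = swapctl(SWAP_STATS, sep, nswap);
	for (i = 0; i < n; i++)
		printf("%s: %d blocks, %d in use, priority %d\n",
		    sep[i].se_path, sep[i].se_nblks, sep[i].se_inuse,
		    sep[i].se_priority);

	free(sep);
	return 0;
}
#endif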