/*	$NetBSD: uvm_swap.c,v 1.180 2019/01/27 05:22:19 kre Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.180 2019/01/27 05:22:19 kre Exp $");

#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vmem.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * the information in a "swapdev" is converted to a "swapent" structure
 * which contains the information that is passed up to the user
 * (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * TAILQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.  The actual work is done
 *	in the uvm_swap_stats() function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
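
/*
 * Illustrative only (not compiled): a minimal userland sketch of the
 * swapctl(2) interface described above, using SWAP_NSWAP to size the
 * array and SWAP_STATS to fill it.  Error handling is abbreviated;
 * consult swapctl(2) for the full contract.
 */
#if 0
#include <sys/swap.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	struct swapent *sep;
	int i, nswap;

	/* [1] SWAP_NSWAP: how many devices are configured? */
	nswap = swapctl(SWAP_NSWAP, NULL, 0);
	if (nswap <= 0)
		return 0;

	/* [2] SWAP_STATS: load the current swap config into an array. */
	sep = calloc(nswap, sizeof(*sep));
	if (sep == NULL)
		err(1, "calloc");
	nswap = swapctl(SWAP_STATS, sep, nswap);
	for (i = 0; i < nswap; i++)
		printf("%s: %d blocks, %d in use, priority %d\n",
		    sep[i].se_path, sep[i].se_nblks, sep[i].se_inuse,
		    sep[i].se_priority);
	free(sep);
	return 0;
}
#endif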

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 *    swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 *    swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	dev_t			swd_dev;	/* device id */
	int			swd_flags;	/* flags:inuse/enable/fake */
	int			swd_priority;	/* our priority */
	int			swd_nblks;	/* blocks in this device */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};
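
/*
 * Illustrative only (not compiled): the canonical way to walk the
 * two-level structure above -- the sorted LIST of priorities, and the
 * TAILQ of devices at each priority.  swaplist_find() and
 * uvm_swap_alloc() below follow exactly this pattern.
 */
#if 0
	struct swappri *spp;
	struct swapdev *sdp;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			/* visit sdp; priorities arrive in sorted order */
		}
	}
#endif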

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
static struct pool vndxfer_pool, vndbuf_pool;

/*
 * local variables
 */
static vmem_t *swapmap;	/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static krwlock_t swap_syscall_lock;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/* tuneables */
u_int uvm_swapisfull_factor = 99;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("%s: can't get vnode for swap device", __func__);
	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
		panic("%s: can't lock swap device", __func__);
	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
		panic("%s: can't open swap device", __func__);
	VOP_UNLOCK(swapdev_vp);

	/*
	 * create swap block resource map to map /dev/drum.  the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
	    VM_NOSLEEP, IPL_NONE);
	if (swapmap == 0) {
		panic("%s: vmem_create failed", __func__);
	}

	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
	    NULL, IPL_BIO);
	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
	    NULL, IPL_BIO);

	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}
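
/*
 * Illustrative only (not compiled): the swapmap arena created above
 * hands out contiguous page ranges in /dev/drum.  swap_on() and
 * swap_off() below use it like this; block 0 is never returned because
 * the arena starts at 1.  "sdp" and "npages" stand in for the values
 * computed in swap_on().
 */
#if 0
	vmem_addr_t result;

	/* carve a contiguous drum range for a device of npages pages */
	if (vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result) != 0)
		panic("no drum address space left");
	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;

	/* ... and give it back when the device is removed */
	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
#endif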

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly allocated swappri structure (we will
 *	FREE it if we don't need it... this is to prevent allocation
 *	blocking here while adding swap)
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %jd",
		    priority, 0, 0, 0);

		spp->spi_priority = priority;
		TAILQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		kmem_free(newspp, sizeof(*newspp));
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * tailq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					TAILQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
		}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 */
static void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
		if (!TAILQ_EMPTY(&spp->spi_swapdev))
			continue;
		LIST_REMOVE(spp, spi_swappri);
		kmem_free(spp, sizeof(*spp));
	}
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm_swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}
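
/*
 * Worked example (illustrative, with made-up numbers): if device A was
 * assigned drum pages [1, 1000] and device B pages [1001, 3000], then
 * swapdrum_getsdp(1500) returns B, and the device-relative page is
 * 1500 - B->swd_drumoffset = 499.  This is exactly the arithmetic
 * swstrategy() performs below before re-issuing the i/o.
 */
#if 0
	int pgno = 1500;
	struct swapdev *sdp = swapdrum_getsdp(pgno);	/* device B */
	int devpage = pgno - sdp->swd_drumoffset;	/* page 499 on B */
#endif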

void swapsys_lock(krw_t op)
{
	rw_enter(&swap_syscall_lock, op);
}

void swapsys_unlock(void)
{
	rw_exit(&swap_syscall_lock);
}

static void
swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
{
	se->se_dev = sdp->swd_dev;
	se->se_flags = sdp->swd_flags;
	se->se_nblks = sdp->swd_nblks;
	se->se_inuse = inuse;
	se->se_priority = sdp->swd_priority;
	KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
	strcpy(se->se_path, sdp->swd_path);
}

int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
    (void *)enosys;
int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
    (void *)enosys;

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
#define SWAP_PATH_MAX (PATH_MAX + 1)
	char	*userpath;
	size_t	len = 0;
	int	error;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		const int nswapdev = uvmexp.nswapdev;
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
		    0, 0, 0);
		*retval = nswapdev;
		return 0;
	}

	userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter(&swap_syscall_lock, RW_WRITER);

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm_swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	switch (SCARG(uap, cmd)) {
	case SWAP_STATS13:
		error = (*uvm_swap_stats13)(uap, retval);
		goto out;
	case SWAP_STATS50:
		error = (*uvm_swap_stats50)(uap, retval);
		goto out;
	case SWAP_STATS:
		error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
		    NULL, sizeof(struct swapent), retval);
		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
		goto out;

	case SWAP_GETDUMPDEV:
		error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
		goto out;
	default:
		break;
	}

	/*
	 * all other requests require superuser privs.  verify.
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
	    0, NULL, NULL, NULL)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
		/* drop the current dump device */
		dumpdev = NODEV;
		dumpcdev = NODEV;
		cpu_dumpconf();
		goto out;
	}

	/*
	 * at this point we expect a path name in arg.  we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		vref(vp);
		if (vn_lock(vp, LK_EXCLUSIVE)) {
			vrele(vp);
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		struct pathbuf *pb;

		/*
		 * This used to allow copying in one extra byte
		 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
		 * This was completely pointless because if anyone
		 * used that extra byte namei would fail with
		 * ENAMETOOLONG anyway, so I've removed the excess
		 * logic.  - dholland 20100215
		 */

		error = pathbuf_copyin(SCARG(uap, arg), &pb);
		if (error) {
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON) {
			/* get a copy of the string */
			pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
			len = strlen(userpath) + 1;
		}
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
		if ((error = namei(&nd))) {
			pathbuf_destroy(pb);
			goto out;
		}
		vp = nd.ni_vp;
		pathbuf_destroy(pb);
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		if (bdevsw_lookup(vp->v_rdev)) {
			dumpdev = vp->v_rdev;
			dumpcdev = devsw_blk2chr(dumpdev);
		} else
			dumpdev = NODEV;
		cpu_dumpconf();
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, true)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		mutex_exit(&uvm_swap_data_lock);
		if (error)
			kmem_free(spp, sizeof(*spp));
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.  if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		sdp->swd_flags = SWF_FAKE;
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
		mutex_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, false) != NULL) {
			error = EBUSY;
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			kmem_free(sdp, sizeof(*sdp));
			kmem_free(spp, sizeof(*spp));
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mutex_exit(&uvm_swap_data_lock);

		KASSERT(len > 0);
		sdp->swd_pathlen = len;
		sdp->swd_path = kmem_alloc(len, KM_SLEEP);
		if (copystr(userpath, sdp->swd_path, len, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(l, sdp)) != 0) {
			mutex_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, true);  /* kill fake entry */
			swaplist_trim();
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			kmem_free(sdp->swd_path, sdp->swd_pathlen);
			kmem_free(sdp, sizeof(*sdp));
			break;
		}
		break;

	case SWAP_OFF:
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, false)) == NULL) {
			mutex_exit(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mutex_exit(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(l, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);
out:
	rw_exit(&swap_syscall_lock);
	kmem_free(userpath, SWAP_PATH_MAX);

	UVMHIST_LOG(pdhist, "<- done!  error=%jd", error, 0, 0, 0);
	return (error);
}

/*
 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 * emulation to use it directly without going through sys_swapctl().
 * The problem with using sys_swapctl() there is that it involves
 * copying the swapent array to the stackgap, and this array's size
 * is not known at build time. Hence it would not be possible to
 * ensure it would fit in the stackgap in any case.
 */
int
uvm_swap_stats(char *ptr, int misc,
    void (*f)(void *, const struct swapent *), size_t len,
    register_t *retval)
{
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent sep;
	int count = 0;
	int error;

	KASSERT(len <= sizeof(sep));
	if (len == 0)
		return ENOSYS;

	if (misc < 0)
		return EINVAL;

	if (misc == 0 || uvmexp.nswapdev == 0)
		return 0;

	/* Make sure userland cannot exhaust kernel memory */
	if ((size_t)misc > (size_t)uvmexp.nswapdev)
		misc = uvmexp.nswapdev;

	KASSERT(rw_lock_held(&swap_syscall_lock));

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			int inuse;

			if (misc-- <= 0)
				break;

			inuse = btodb((uint64_t)sdp->swd_npginuse <<
			    PAGE_SHIFT);

			memset(&sep, 0, sizeof(sep));
			swapent_cvt(&sep, sdp, inuse);
			if (f)
				(*f)(&sep, &sep);
			if ((error = copyout(&sep, ptr, len)) != 0)
				return error;
			ptr += len;
			count++;
		}
	}
	*retval = count;
	return 0;
}
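
/*
 * Worked example (illustrative): the "inuse" figure above is reported
 * in DEV_BSIZE disk blocks, not pages.  With the common values
 * PAGE_SHIFT = 12 (4 KB pages) and DEV_BSHIFT = 9 (512-byte blocks),
 * btodb(n << PAGE_SHIFT) reduces to n << 3, i.e. 8 disk blocks per
 * page: 100 pages in use are reported as 800 blocks.
 */
#if 0
	int npginuse = 100;	/* pages */
	int inuse = btodb((uint64_t)npginuse << PAGE_SHIFT);
	/* with 4 KB pages and 512-byte blocks: inuse == 800 */
#endif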

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	vmem_addr_t result;
	struct vattr va;
	dev_t dev;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.  the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);

	/*
	 * we now need to determine the size of the swap area.  for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if ((nblocks = bdev_size(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.  take it easy on NFS servers.
		 */
		if (vp->v_tag == VT_NFS)
			sdp->swd_maxactive = 2; /* XXX */
		else
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.  we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%jx: size=%jd addr=%jd", dev, size, addr, 0);

	/*
	 * now we need to allocate a blist to manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.  do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
		 */
		rootblocks += (rootblocks >> 5) +
			(rootblocks >> 6) +
			(rootblocks >> 7);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
			panic("swap_on: unable to preserve miniroot");
		}

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
	if (error != 0)
		panic("swapdrum_add");
	/*
	 * If this is the first regular swap, create the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (vp->v_type != VBLK) {
		if (sw_reg_count++ == 0) {
			KASSERT(sw_reg_workqueue == NULL);
			if (workqueue_create(&sw_reg_workqueue, "swapiod",
			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
				panic("%s: workqueue_create failed", __func__);
		}
	}

	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	mutex_enter(&uvm_swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	uvmexp.swpgavail += size;
	mutex_exit(&uvm_swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	if (sdp->swd_blist) {
		blist_destroy(sdp->swd_blist);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
	}
	return (error);
}
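
/*
 * Illustrative only (not compiled): the lifecycle of the blist
 * allocator (subr_blist.c) as used by swap_on()/swap_off() above,
 * collected in one place.  "npages", "addr", "size" and "rootpages"
 * stand in for the values computed in swap_on().
 */
#if 0
	blist_t bl = blist_create(npages);	/* all blocks start allocated */
	blist_free(bl, addr, size);		/* mark the usable range free */
	(void)blist_fill(bl, addr, rootpages);	/* pin down a reserved range */
	uint64_t blk = blist_alloc(bl, 4);	/* grab 4 contiguous slots */
	if (blk != BLIST_NONE)
		blist_free(bl, blk, 4);		/* ... and return them */
	blist_destroy(bl);
#endif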

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
	int npages = sdp->swd_npages;
	int error = 0;

	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%jx, npages=%jd", sdp->swd_dev,npages, 0, 0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	uvmexp.swpgavail -= npages;
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize)) {
		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		mutex_enter(&uvm_swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		uvmexp.swpgavail += npages;
		mutex_exit(&uvm_swap_data_lock);

		return error;
	}

	/*
	 * If this is the last regular swap, destroy the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (sdp->swd_vp->v_type != VBLK) {
		KASSERT(sw_reg_count > 0);
		KASSERT(sw_reg_workqueue != NULL);
		if (--sw_reg_count == 0) {
			workqueue_destroy(sw_reg_workqueue);
			sw_reg_workqueue = NULL;
		}
	}

	/*
	 * done with the vnode.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
	}

	mutex_enter(&uvm_swap_data_lock);
	uvmexp.swpages -= npages;
	uvmexp.swpginuse -= sdp->swd_npgbad;

	if (swaplist_find(sdp->swd_vp, true) == NULL)
		panic("%s: swapdev not in list", __func__);
	swaplist_trim();
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * free all resources!
	 */
	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
	blist_destroy(sdp->swd_blist);
	bufq_free(sdp->swd_tab);
	kmem_free(sdp, sizeof(*sdp));
	return (0);
}

void
uvm_swap_shutdown(struct lwp *l)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct vnode *vp;
	int error;

	printf("turning off swap...");
	rw_enter(&swap_syscall_lock, RW_WRITER);
	mutex_enter(&uvm_swap_data_lock);
again:
	LIST_FOREACH(spp, &swap_priority, spi_swappri)
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
				continue;
#ifdef DEBUG
			printf("\nturning off swap on %s...",
			    sdp->swd_path);
#endif
			if (vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE)) {
				error = EBUSY;
				vp = NULL;
			} else
				error = 0;
			if (!error) {
				error = swap_off(l, sdp);
				mutex_enter(&uvm_swap_data_lock);
			}
			if (error) {
				printf("stopping swap on %s failed "
				    "with error %d\n", sdp->swd_path, error);
				TAILQ_REMOVE(&spp->spi_swapdev, sdp,
				    swd_next);
				uvmexp.nswapdev--;
				swaplist_trim();
				if (vp)
					vput(vp);
			}
			goto again;
		}
	printf(" done\n");
	mutex_exit(&uvm_swap_data_lock);
	rw_exit(&swap_syscall_lock);
}


/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
static void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	struct vnode *vp;
	int pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.  note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	mutex_exit(&uvm_swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	UVMHIST_LOG(pdhist, "  Rd/Wr (0/1) %jd: mapoff=%jx bn=%jx bcount=%jd",
	    ((bp->b_flags & B_READ) == 0) ? 1 : 0,
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	vp = sdp->swd_vp;		/* swapdev vnode pointer */
	switch (vp->v_type) {
	default:
		panic("%s: vnode type 0x%x", __func__, vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		bp->b_blkno = bn;		/* swapdev block number */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(bp->b_objlock);
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			mutex_exit(bp->b_objlock);
			mutex_enter(vp->v_interlock);
			vp->v_numoutput++;	/* put it on swapdev */
			mutex_exit(vp->v_interlock);
		}

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		bp->b_objlock = vp->v_interlock;
		VOP_STRATEGY(vp, bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}
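
/*
 * Worked example (illustrative): with 4 KB pages (PAGE_SHIFT = 12) and
 * 512-byte disk blocks (DEV_BSHIFT = 9), a request at drum block 8192
 * maps to drum page 8192 >> 3 = 1024.  If the owning device has
 * swd_drumoffset = 1000, the device-relative page is 24 and the
 * re-issued i/o starts at disk block 24 << 3 = 192 on that device.
 */
#if 0
	int pageno = dbtob((int64_t)8192) >> PAGE_SHIFT;	/* 1024 */
	int devpage = pageno - 1000;				/* 24 */
	int bn = btodb((uint64_t)devpage << PAGE_SHIFT);	/* 192 */
#endif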

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swread(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swwrite(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

const struct bdevsw swap_bdevsw = {
	.d_open = nullopen,
	.d_close = nullclose,
	.d_strategy = swstrategy,
	.d_ioctl = noioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_OTHER
};

const struct cdevsw swap_cdevsw = {
	.d_open = nullopen,
	.d_close = nullclose,
	.d_read = swread,
	.d_write = swwrite,
	.d_ioctl = noioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER,
};

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	char		*addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((uint64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("%s: swap to sparse file", __func__);
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
		    "vp %#jx/%#jx offset 0x%jx/0x%jx",
		    (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);

		/*
		 * now get a buf structure.  note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
		buf_init(&nbp->vb_buf);
		nbp->vb_buf.b_flags    = bp->b_flags;
		nbp->vb_buf.b_cflags   = bp->b_cflags;
		nbp->vb_buf.b_oflags   = bp->b_oflags;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_lblkno   = 0;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_iodone   = sw_reg_biodone;
		nbp->vb_buf.b_vp       = vp;
		nbp->vb_buf.b_objlock  = vp->v_interlock;
		if (vp->v_type == VBLK) {
			nbp->vb_buf.b_dev = vp->v_rdev;
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			buf_destroy(&nbp->vb_buf);
			pool_put(&vndbuf_pool, nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* sort it in and start I/O if we are not over our limit */
		/* XXXAD locking */
		bufq_put(sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		error = vnx->vx_error;
		pool_put(&vndxfer_pool, vnx);
		bp->b_error = error;
		biodone(bp);
	}
	splx(s);
}
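
/*
 * Worked example (illustrative): suppose swd_bsize = 8192 (an 8 KB
 * filesystem block) and the transfer starts at byteoff = 20480.  Then
 * VOP_BMAP is asked for file block 20480 / 8192 = 2, the offset into
 * that block is off = 20480 % 8192 = 4096, and with nra = 1 the chunk
 * size is sz = (1 + 1) * 8192 - 4096 = 12288 bytes (clipped to resid).
 */
#if 0
	int bsize = 8192, nra = 1;
	off_t byteoff = 20480;
	off_t fsblock = byteoff / bsize;	/* 2 */
	int off = byteoff % bsize;		/* 4096 */
	int sz = (1 + nra) * bsize - off;	/* 12288 */
#endif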

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by b_rawblkno (above)
 */
static void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*bp;
	struct vnode	*vp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = bufq_get(sdp->swd_tab);
		if (bp == NULL)
			break;
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %#jx vp %#jx blkno %#jx cnt %jx",
		    (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
		    bp->b_bcount);
		vp = bp->b_vp;
		KASSERT(bp->b_objlock == vp->v_interlock);
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(vp->v_interlock);
			vp->v_numoutput++;
			mutex_exit(vp->v_interlock);
		}
		VOP_STRATEGY(vp, bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_biodone: one of our i/o's has completed
 */
static void
sw_reg_biodone(struct buf *bp)
{
	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(struct work *wk, void *dummy)
{
	struct vndbuf *vbp = (void *)wk;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int s, resid, error;
	KASSERT(&vbp->vb_buf.b_work == wk);
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%#jx vp=%#jx blkno=%jx addr=%#jx",
	    (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
	    (uintptr_t)vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%jx resid=%jx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error != 0) {
		/* pass error upward */
		error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
		UVMHIST_LOG(pdhist, "  got error=%jd !", error, 0, 0, 0);
		vnx->vx_error = error;
	}

	/*
	 * kill vbp structure
	 */
	buf_destroy(&vbp->vb_buf);
	pool_put(&vndbuf_pool, vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			pbp->b_error = error;
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone, pbp=%#jx error=%jd !",
			    (uintptr_t)pbp, vnx->vx_error, 0, 0);
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	}

	/*
	 * done!  start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}
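
/*
 * Illustrative only (not compiled): the deferral pattern used by
 * sw_reg_biodone()/sw_reg_iodone() above.  biodone() callbacks run in
 * interrupt context, so the completion work is bounced to a kernel
 * thread via workqueue(9); the struct work is embedded in the object
 * being completed, so the handler recovers it by casting.  "handler"
 * and "obj" are placeholders, not names from this file.
 */
#if 0
	struct workqueue *wq;

	/* created once, at IPL_BIO, when the first user shows up */
	workqueue_create(&wq, "example", handler, NULL, PRIBIO, IPL_BIO, 0);

	/* interrupt side: hand the object to the thread */
	workqueue_enqueue(wq, &obj->work_member, NULL);

	/*
	 * thread side: handler(struct work *wk, void *arg) casts wk
	 * back to the containing object and finishes the i/o.
	 */
	workqueue_destroy(wq);
#endif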


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm_swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?  definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * XXXJAK: BEGIN HACK
	 *
	 * blist_alloc() in subr_blist.c will panic if we try to allocate
	 * too many slots.
	 */
	if (*nslots > BLIST_MAX_ALLOC) {
		if (__predict_false(lessok == false))
			return 0;
		*nslots = BLIST_MAX_ALLOC;
	}
	/* XXXJAK: END HACK */

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	mutex_enter(&uvm_swap_data_lock);

ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			uint64_t result;

			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			result = blist_alloc(sdp->swd_blist, *nslots);
			if (result == BLIST_NONE) {
				continue;
			}
			KASSERT(result < sdp->swd_drumsize);

			/*
			 * successful allocation!  now rotate the tailq.
			 */
			TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			mutex_exit(&uvm_swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %jd slots starting at %jd",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return (result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		/* XXXMRG: ugh!  blist should support this for us */
		goto ReTry;
	}
	/* XXXMRG: END HACK */

	mutex_exit(&uvm_swap_data_lock);
	return 0;
}
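
/*
 * Illustrative only (not compiled): a caller's view of the allocator.
 * With lessok set, the request may be trimmed, so the caller must
 * re-read *nslots; a return of 0 means "no swap space" (slot 0 is
 * never valid).
 */
#if 0
	int nslots = 8;
	int slot = uvm_swap_alloc(&nslots, true /* lessok */);
	if (slot == 0)
		return ENOMEM;		/* no space; fall back or fail */
	/* ... i/o via uvm_swap_put()/uvm_swap_get() ... */
	uvm_swap_free(slot, nslots);	/* give the slots back */
#endif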

/*
 * uvm_swapisfull: return true if most of available swap is allocated
 * and in use.  we don't count some small portion as it may be inaccessible
 * to us at any given moment, for example if there is lock contention or if
 * pages are busy.
 */
bool
uvm_swapisfull(void)
{
	int swpgonly;
	bool rv;

	mutex_enter(&uvm_swap_data_lock);
	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
	swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
	    uvm_swapisfull_factor);
	rv = (swpgonly >= uvmexp.swpgavail);
	mutex_exit(&uvm_swap_data_lock);

	return (rv);
}

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(sdp != NULL);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */

	KASSERT(uvmexp.swpgonly >= nslots);
	uvmexp.swpgonly -= nslots;
	sdp->swd_npgbad += nslots;
	UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0);
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %jd slots starting at %jd", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the blist, and return.  must hold the swap data lock
	 * to do the lookup and access the blist.
	 */

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
	mutex_exit(&uvm_swap_data_lock);
}
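
/*
 * Worked example (illustrative): with the default uvm_swapisfull_factor
 * of 99, swap is reported full once swpgonly * 100 / 99 >= swpgavail,
 * i.e. once swap-only pages reach 99% of the available pages.  For
 * swpgavail = 10000 the threshold is swpgonly >= 9900.
 */
#if 0
	/* threshold check, as computed in uvm_swapisfull() above */
	bool full = ((uint64_t)9900 * 100 / 99) >= 10000;	/* true */
#endif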

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 */

int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int error;

	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	return error;
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 */

int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int error;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return EIO;
	}

	error = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	if (error == 0) {

		/*
		 * this page is no longer only in swap.
		 */

		mutex_enter(&uvm_swap_data_lock);
		KASSERT(uvmexp.swpgonly > 0);
		uvmexp.swpgonly--;
		mutex_exit(&uvm_swap_data_lock);
	}
	return error;
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct buf *bp;
	vaddr_t kva;
	int error, mapinflags;
	bool write, async;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%jd",
	    startslot, npages, flags, 0);

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/*
	 * allocate a buf for the i/o.
	 */

	KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
	bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
	if (bp == NULL) {
		uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
		return ENOMEM;
	}

	/*
	 * convert starting drum slot to block number
	 */

	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel.
	 */

	mapinflags = !write ?
	    UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
	    UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
	kva = uvm_pagermapin(pps, npages, mapinflags);

	/*
	 * fill in the bp/sbp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */

	bp->b_cflags = BC_BUSY | BC_NOCACHE;
	bp->b_flags = (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (void *)kva;
	bp->b_blkno = startblk;
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * bump v_numoutput (counter of number of active outputs).
	 */

	if (write) {
		mutex_enter(swapdev_vp->v_interlock);
		swapdev_vp->v_numoutput++;
		mutex_exit(swapdev_vp->v_interlock);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */

	if (async) {
		bp->b_iodone = uvm_aio_biodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
		if (curlwp == uvm.pagedaemon_lwp)
			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
		else
			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
	} else {
		bp->b_iodone = NULL;
		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %#jx blkno = 0x%jx, bcount = %jd",
	    (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */

	VOP_STRATEGY(swapdev_vp, bp);
	if (async)
		return 0;

	/*
	 * must be sync i/o.  wait for it to finish
	 */

	error = biowait(bp);

	/*
	 * kill the pager mapping
	 */

	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the buf and we're done.
	 */

	if (write) {
		mutex_enter(swapdev_vp->v_interlock);
		vwakeup(bp);
		mutex_exit(swapdev_vp->v_interlock);
	}
	putiobuf(bp);
	UVMHIST_LOG(pdhist, "<- done (sync)  error=%jd", error, 0, 0, 0);

	return (error);
}