/*	$NetBSD: uvm_swap.c,v 1.175 2017/10/28 00:37:13 pgoyette Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.175 2017/10/28 00:37:13 pgoyette Exp $");

#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vmem.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * TAILQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl() call performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.  The actual work is done
 *	in the uvm_swap_stats() function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	dev_t			swd_dev;	/* device id */
	int			swd_flags;	/* flags:inuse/enable/fake */
	int			swd_priority;	/* our priority */
	int			swd_nblks;	/* blocks in this device */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
				/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * NetBSD 1.3 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
 * dev_t and has no se_path[] member.
 */
struct swapent13 {
	int32_t	se13_dev;		/* device id */
	int	se13_flags;		/* flags */
	int	se13_nblks;		/* total blocks */
	int	se13_inuse;		/* blocks in use */
	int	se13_priority;		/* priority of this device */
};

/*
 * NetBSD 5.0 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
 * dev_t.
 */
struct swapent50 {
	int32_t	se50_dev;		/* device id */
	int	se50_flags;		/* flags */
	int	se50_nblks;		/* total blocks */
	int	se50_inuse;		/* blocks in use */
	int	se50_priority;		/* priority of this device */
	char	se50_path[PATH_MAX+1];	/* path name */
};

/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
static struct pool vndxfer_pool, vndbuf_pool;

/*
 * local variables
 */
static vmem_t *swapmap;	/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static krwlock_t swap_syscall_lock;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/* tuneables */
u_int uvm_swapisfull_factor = 99;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("%s: can't get vnode for swap device", __func__);
	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
		panic("%s: can't lock swap device", __func__);
	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
		panic("%s: can't open swap device", __func__);
	VOP_UNLOCK(swapdev_vp);

	/*
	 * create swap block resource map to map /dev/drum.  the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
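	 * a drum "slot" is one page; elsewhere in this file slot numbers
	 * are converted to device blocks with btodb(slot << PAGE_SHIFT)
	 * (see swstrategy() and uvm_swap_io()).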
	 */
	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
	    VM_NOSLEEP, IPL_NONE);
	if (swapmap == 0) {
		panic("%s: vmem_create failed", __func__);
	}

	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
	    NULL, IPL_BIO);
	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
	    NULL, IPL_BIO);

	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly allocated swappri structure (we will
 *	FREE it if we don't need it... this is to prevent allocation
 *	from blocking here while adding swap)
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %jd",
		    priority, 0, 0, 0);

		spp->spi_priority = priority;
		TAILQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		kmem_free(newspp, sizeof(*newspp));
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * tailq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					TAILQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
		}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 */
static void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
		if (!TAILQ_EMPTY(&spp->spi_swapdev))
			continue;
		LIST_REMOVE(spp, spi_swappri);
		kmem_free(spp, sizeof(*spp));
	}
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
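 *	(e.g. a swapdev with swd_drumoffset = N and swd_drumsize = M
 *	covers drum pages [N, N+M), so the lookup below is a linear
 *	scan of the priority list.)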
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm_swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}

void swapsys_lock(krw_t op)
{
	rw_enter(&swap_syscall_lock, op);
}

void swapsys_unlock(void)
{
	rw_exit(&swap_syscall_lock);
}

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
#define SWAP_PATH_MAX (PATH_MAX + 1)
	char	*userpath;
	size_t	len = 0;
	int	error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		const int nswapdev = uvmexp.nswapdev;
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
		    0, 0, 0);
		*retval = nswapdev;
		return 0;
	}

	misc = SCARG(uap, misc);
	userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter(&swap_syscall_lock, RW_WRITER);

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm_swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_50)
	    || SCARG(uap, cmd) == SWAP_STATS50
#endif
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_STATS13
#endif
	    ) {
		if (misc < 0) {
			error = EINVAL;
			goto out;
		}
		if (misc == 0 || uvmexp.nswapdev == 0) {
			error = 0;
			goto out;
		}
		/* Make sure userland cannot exhaust kernel memory */
		if ((size_t)misc > (size_t)uvmexp.nswapdev)
			misc = uvmexp.nswapdev;
		KASSERT(misc > 0);
#if defined(COMPAT_13)
		if (SCARG(uap, cmd) == SWAP_STATS13)
			len = sizeof(struct swapent13) * misc;
		else
#endif
#if defined(COMPAT_50)
		if (SCARG(uap, cmd) == SWAP_STATS50)
			len = sizeof(struct swapent50) * misc;
		else
#endif
			len = sizeof(struct swapent) * misc;
		sep = (struct swapent *)kmem_alloc(len, KM_SLEEP);

		uvm_swap_stats(SCARG(uap, cmd), sep, misc, retval);
		error = copyout(sep, SCARG(uap, arg), len);

		kmem_free(sep, len);
		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
		goto out;
	}
	if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
		dev_t	*devp = (dev_t *)SCARG(uap, arg);

		error = copyout(&dumpdev, devp, sizeof(dumpdev));
		goto out;
	}

	/*
	 * all other requests require superuser privs.  verify.
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
	    0, NULL, NULL, NULL)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
		/* drop the current dump device */
		dumpdev = NODEV;
		dumpcdev = NODEV;
		cpu_dumpconf();
		goto out;
	}

	/*
	 * at this point we expect a path name in arg.  we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		vref(vp);
		if (vn_lock(vp, LK_EXCLUSIVE)) {
			vrele(vp);
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		struct pathbuf *pb;

		/*
		 * This used to allow copying in one extra byte
		 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
		 * This was completely pointless because if anyone
		 * used that extra byte namei would fail with
		 * ENAMETOOLONG anyway, so I've removed the excess
		 * logic.  - dholland 20100215
		 */

		error = pathbuf_copyin(SCARG(uap, arg), &pb);
		if (error) {
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON) {
			/* get a copy of the string */
			pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
			len = strlen(userpath) + 1;
		}
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
		if ((error = namei(&nd))) {
			pathbuf_destroy(pb);
			goto out;
		}
		vp = nd.ni_vp;
		pathbuf_destroy(pb);
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		if (bdevsw_lookup(vp->v_rdev)) {
			dumpdev = vp->v_rdev;
			dumpcdev = devsw_blk2chr(dumpdev);
		} else
			dumpdev = NODEV;
		cpu_dumpconf();
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
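		 *
		 * note: the new swappri is allocated before we take
		 * uvm_swap_data_lock so we never sleep in kmem_alloc()
		 * while holding it; it is freed again if it turns out
		 * to be unneeded.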
		 */
		priority = SCARG(uap, misc);
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, true)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		mutex_exit(&uvm_swap_data_lock);
		if (error)
			kmem_free(spp, sizeof(*spp));
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.  if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		sdp->swd_flags = SWF_FAKE;
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
		mutex_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, false) != NULL) {
			error = EBUSY;
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			kmem_free(sdp, sizeof(*sdp));
			kmem_free(spp, sizeof(*spp));
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mutex_exit(&uvm_swap_data_lock);

		KASSERT(len > 0);
		sdp->swd_pathlen = len;
		sdp->swd_path = kmem_alloc(len, KM_SLEEP);
		if (copystr(userpath, sdp->swd_path, len, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(l, sdp)) != 0) {
			mutex_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, true);  /* kill fake entry */
			swaplist_trim();
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			kmem_free(sdp->swd_path, sdp->swd_pathlen);
			kmem_free(sdp, sizeof(*sdp));
			break;
		}
		break;

	case SWAP_OFF:
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, false)) == NULL) {
			mutex_exit(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mutex_exit(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(l, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);
out:
	rw_exit(&swap_syscall_lock);
	kmem_free(userpath, SWAP_PATH_MAX);

	UVMHIST_LOG(pdhist, "<- done!  error=%jd", error, 0, 0, 0);
	return (error);
}

/*
 * uvm_swap_stats: implements swapctl(SWAP_STATS).  The function is kept
 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 * emulation to use it directly without going through sys_swapctl().
 * The problem with using sys_swapctl() there is that it involves
 * copying the swapent array to the stackgap, and this array's size
 * is not known at build time.  Hence it would not be possible to
 * ensure it would fit in the stackgap in any case.
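 *
 * => caller must hold swap_syscall_lock (we KASSERT this below)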
 */
void
uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval)
{
	struct swappri *spp;
	struct swapdev *sdp;
	int count = 0;

	KASSERT(rw_lock_held(&swap_syscall_lock));

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			int inuse;

			if (sec-- <= 0)
				break;

			/*
			 * backwards compatibility for system call.
			 * For NetBSD 1.3 and 5.0, we have to use
			 * the 32 bit dev_t.  For 5.0 and -current
			 * we have to add the path.
			 */
			inuse = btodb((uint64_t)sdp->swd_npginuse <<
			    PAGE_SHIFT);

#if defined(COMPAT_13) || defined(COMPAT_50)
			if (cmd == SWAP_STATS) {
#endif
				sep->se_dev = sdp->swd_dev;
				sep->se_flags = sdp->swd_flags;
				sep->se_nblks = sdp->swd_nblks;
				sep->se_inuse = inuse;
				sep->se_priority = sdp->swd_priority;
				KASSERT(sdp->swd_pathlen <
				    sizeof(sep->se_path));
				strcpy(sep->se_path, sdp->swd_path);
				sep++;
#if defined(COMPAT_13)
			} else if (cmd == SWAP_STATS13) {
				struct swapent13 *sep13 =
				    (struct swapent13 *)sep;

				sep13->se13_dev = sdp->swd_dev;
				sep13->se13_flags = sdp->swd_flags;
				sep13->se13_nblks = sdp->swd_nblks;
				sep13->se13_inuse = inuse;
				sep13->se13_priority = sdp->swd_priority;
				sep = (struct swapent *)(sep13 + 1);
#endif
#if defined(COMPAT_50)
			} else if (cmd == SWAP_STATS50) {
				struct swapent50 *sep50 =
				    (struct swapent50 *)sep;

				sep50->se50_dev = sdp->swd_dev;
				sep50->se50_flags = sdp->swd_flags;
				sep50->se50_nblks = sdp->swd_nblks;
				sep50->se50_inuse = inuse;
				sep50->se50_priority = sdp->swd_priority;
				KASSERT(sdp->swd_pathlen <
				    sizeof(sep50->se50_path));
				strcpy(sep50->se50_path, sdp->swd_path);
				sep = (struct swapent *)(sep50 + 1);
#endif
#if defined(COMPAT_13) || defined(COMPAT_50)
			}
#endif
			count++;
		}
	}
	*retval = count;
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	vmem_addr_t result;
	struct vattr va;
	dev_t dev;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.  the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);

	/*
	 * we now need to determine the size of the swap area.  for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if ((nblocks = bdev_size(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.  take it easy on NFS servers.
		 */
		if (vp->v_tag == VT_NFS)
			sdp->swd_maxactive = 2; /* XXX */
		else
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.  we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%jx: size=%jd addr=%jd", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.  do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
		 */
		rootblocks += (rootblocks >> 5) +
		    (rootblocks >> 6) +
		    (rootblocks >> 7);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
			panic("swap_on: unable to preserve miniroot");
		}

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
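	 * (the reference is dropped again in swap_off().)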
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
	if (error != 0)
		panic("swapdrum_add");
	/*
	 * If this is the first regular swap, create the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (vp->v_type != VBLK) {
		if (sw_reg_count++ == 0) {
			KASSERT(sw_reg_workqueue == NULL);
			if (workqueue_create(&sw_reg_workqueue, "swapiod",
			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
				panic("%s: workqueue_create failed", __func__);
		}
	}

	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	mutex_enter(&uvm_swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	uvmexp.swpgavail += size;
	mutex_exit(&uvm_swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	if (sdp->swd_blist) {
		blist_destroy(sdp->swd_blist);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
	}
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
	int npages = sdp->swd_npages;
	int error = 0;

	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%jx, npages=%jd", sdp->swd_dev, npages, 0, 0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	uvmexp.swpgavail -= npages;
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize)) {
		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		mutex_enter(&uvm_swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		uvmexp.swpgavail += npages;
		mutex_exit(&uvm_swap_data_lock);

		return error;
	}

	/*
	 * If this is the last regular swap, destroy the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (sdp->swd_vp->v_type != VBLK) {
		KASSERT(sw_reg_count > 0);
		KASSERT(sw_reg_workqueue != NULL);
		if (--sw_reg_count == 0) {
			workqueue_destroy(sw_reg_workqueue);
			sw_reg_workqueue = NULL;
		}
	}

	/*
	 * done with the vnode.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
	}

	mutex_enter(&uvm_swap_data_lock);
	uvmexp.swpages -= npages;
	uvmexp.swpginuse -= sdp->swd_npgbad;

	if (swaplist_find(sdp->swd_vp, true) == NULL)
		panic("%s: swapdev not in list", __func__);
	swaplist_trim();
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * free all resources!
	 */
	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
	blist_destroy(sdp->swd_blist);
	bufq_free(sdp->swd_tab);
	kmem_free(sdp, sizeof(*sdp));
	return (0);
}

void
uvm_swap_shutdown(struct lwp *l)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct vnode *vp;
	int error;

	printf("turning off swap...");
	rw_enter(&swap_syscall_lock, RW_WRITER);
	mutex_enter(&uvm_swap_data_lock);
again:
	LIST_FOREACH(spp, &swap_priority, spi_swappri)
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
				continue;
#ifdef DEBUG
			printf("\nturning off swap on %s...",
			    sdp->swd_path);
#endif
			if (vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE)) {
				error = EBUSY;
				vp = NULL;
			} else
				error = 0;
			if (!error) {
				error = swap_off(l, sdp);
				mutex_enter(&uvm_swap_data_lock);
			}
			if (error) {
				printf("stopping swap on %s failed "
				    "with error %d\n", sdp->swd_path, error);
				TAILQ_REMOVE(&spp->spi_swapdev, sdp,
				    swd_next);
				uvmexp.nswapdev--;
				swaplist_trim();
				if (vp)
					vput(vp);
			}
			goto again;
		}
	printf(" done\n");
	mutex_exit(&uvm_swap_data_lock);
	rw_exit(&swap_syscall_lock);
}


/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
static void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	struct vnode *vp;
	int pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.  note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	mutex_exit(&uvm_swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	UVMHIST_LOG(pdhist, "  Rd/Wr (0/1) %jd: mapoff=%jx bn=%jx bcount=%jd",
	    ((bp->b_flags & B_READ) == 0) ? 1 : 0,
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	vp = sdp->swd_vp;		/* swapdev vnode pointer */
	switch (vp->v_type) {
	default:
		panic("%s: vnode type 0x%x", __func__, vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		bp->b_blkno = bn;		/* swapdev block number */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
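		 * (for i/o queued by uvm_swap_io() the output was counted
		 * on the drum vnode; vwakeup() below releases that and we
		 * charge the output to the real swap device instead.)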
		 */
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(bp->b_objlock);
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			mutex_exit(bp->b_objlock);
			mutex_enter(vp->v_interlock);
			vp->v_numoutput++;	/* put it on swapdev */
			mutex_exit(vp->v_interlock);
		}

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		bp->b_objlock = vp->v_interlock;
		VOP_STRATEGY(vp, bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swread(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swwrite(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%jx offset=%jx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

const struct bdevsw swap_bdevsw = {
	.d_open = nullopen,
	.d_close = nullclose,
	.d_strategy = swstrategy,
	.d_ioctl = noioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_OTHER
};

const struct cdevsw swap_cdevsw = {
	.d_open = nullopen,
	.d_close = nullclose,
	.d_read = swread,
	.d_write = swwrite,
	.d_ioctl = noioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER,
};

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	char		*addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((uint64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.
			 * Instead, it causes random memory errors.  The
			 * panic() should remain until this condition
			 * doesn't destabilize the system.
			 */
#if 1
			panic("%s: swap to sparse file", __func__);
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
		    "vp %#jx/%#jx offset 0x%jx/0x%jx",
		    (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);

		/*
		 * now get a buf structure.  note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
		buf_init(&nbp->vb_buf);
		nbp->vb_buf.b_flags = bp->b_flags;
		nbp->vb_buf.b_cflags = bp->b_cflags;
		nbp->vb_buf.b_oflags = bp->b_oflags;
		nbp->vb_buf.b_bcount = sz;
		nbp->vb_buf.b_bufsize = sz;
		nbp->vb_buf.b_error = 0;
		nbp->vb_buf.b_data = addr;
		nbp->vb_buf.b_lblkno = 0;
		nbp->vb_buf.b_blkno = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_iodone = sw_reg_biodone;
		nbp->vb_buf.b_vp = vp;
		nbp->vb_buf.b_objlock = vp->v_interlock;
		if (vp->v_type == VBLK) {
			nbp->vb_buf.b_dev = vp->v_rdev;
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			buf_destroy(&nbp->vb_buf);
			pool_put(&vndbuf_pool, nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* sort it in and start I/O if we are not over our limit */
		/* XXXAD locking */
		bufq_put(sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		error = vnx->vx_error;
		pool_put(&vndxfer_pool, vnx);
		bp->b_error = error;
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by b_rawblkno (above)
 */
static void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*bp;
	struct vnode	*vp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = bufq_get(sdp->swd_tab);
		if (bp == NULL)
			break;
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %#jx vp %#jx blkno %#jx cnt %jx",
		    (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
		    bp->b_bcount);
		vp = bp->b_vp;
		KASSERT(bp->b_objlock == vp->v_interlock);
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(vp->v_interlock);
			vp->v_numoutput++;
			mutex_exit(vp->v_interlock);
		}
		VOP_STRATEGY(vp, bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_biodone: one of our i/o's has completed
 */
static void
sw_reg_biodone(struct buf *bp)
{
	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(struct work *wk, void *dummy)
{
	struct vndbuf *vbp = (void *)wk;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int s, resid, error;
	KASSERT(&vbp->vb_buf.b_work == wk);
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%#jx vp=%#jx blkno=%jx addr=%#jx",
	    (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
	    (uintptr_t)vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%jx resid=%jx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error != 0) {
		/* pass error upward */
		error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
		UVMHIST_LOG(pdhist, "  got error=%jd !", error, 0, 0, 0);
		vnx->vx_error = error;
	}

	/*
	 * kill vbp structure
	 */
	buf_destroy(&vbp->vb_buf);
	pool_put(&vndbuf_pool, vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			pbp->b_error = error;
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone, pbp=%#jx error=%jd !",
			    (uintptr_t)pbp, vnx->vx_error, 0, 0);
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	}

	/*
	 * done!  start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm_swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?  definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * XXXJAK: BEGIN HACK
	 *
	 * blist_alloc() in subr_blist.c will panic if we try to allocate
	 * too many slots.
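	 * clamp the request to BLIST_MAX_ALLOC; callers that can live
	 * with fewer slots pass lessok = true, otherwise we fail the
	 * allocation outright.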
	 */
	if (*nslots > BLIST_MAX_ALLOC) {
		if (__predict_false(lessok == false))
			return 0;
		*nslots = BLIST_MAX_ALLOC;
	}
	/* XXXJAK: END HACK */

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	mutex_enter(&uvm_swap_data_lock);

ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			uint64_t result;

			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			result = blist_alloc(sdp->swd_blist, *nslots);
			if (result == BLIST_NONE) {
				continue;
			}
			KASSERT(result < sdp->swd_drumsize);

			/*
			 * successful allocation!  now rotate the tailq.
			 */
			TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			mutex_exit(&uvm_swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %jd slots starting at %jd",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return (result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		/* XXXMRG: ugh!  blist should support this for us */
		goto ReTry;
	}
	/* XXXMRG: END HACK */

	mutex_exit(&uvm_swap_data_lock);
	return 0;
}

/*
 * uvm_swapisfull: return true if most of available swap is allocated
 * and in use.  we don't count some small portion as it may be inaccessible
 * to us at any given moment, for example if there is lock contention or if
 * pages are busy.
 */
bool
uvm_swapisfull(void)
{
	int swpgonly;
	bool rv;

	mutex_enter(&uvm_swap_data_lock);
	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
	swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
	    uvm_swapisfull_factor);
	rv = (swpgonly >= uvmexp.swpgavail);
	mutex_exit(&uvm_swap_data_lock);

	return (rv);
}

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(sdp != NULL);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
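	 * note: the bad slots are not returned to the blist; they stay
	 * allocated (counted by swd_npgbad) until the device is removed
	 * by swap_off().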
	 */

	KASSERT(uvmexp.swpgonly >= nslots);
	uvmexp.swpgonly -= nslots;
	sdp->swd_npgbad += nslots;
	UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0, 0, 0);
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %jd slots starting at %jd", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.  must hold pri lock to do
	 * lookup and access the extent.
	 */

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 */

int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int error;

	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	return error;
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 */

int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int error;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return EIO;
	}

	error = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	if (error == 0) {

		/*
		 * this page is no longer only in swap.
		 */

		mutex_enter(&uvm_swap_data_lock);
		KASSERT(uvmexp.swpgonly > 0);
		uvmexp.swpgonly--;
		mutex_exit(&uvm_swap_data_lock);
	}
	return error;
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct buf *bp;
	vaddr_t kva;
	int error, mapinflags;
	bool write, async;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%jd",
	    startslot, npages, flags, 0);

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/*
	 * allocate a buf for the i/o.
	 */

	KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
	bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
	if (bp == NULL) {
		uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
		return ENOMEM;
	}

	/*
	 * convert starting drum slot to block number
	 */

	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel.
	 */

	mapinflags = !write ?
	    UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
	    UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
	kva = uvm_pagermapin(pps, npages, mapinflags);

	/*
	 * fill in the bp/sbp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */

	bp->b_cflags = BC_BUSY | BC_NOCACHE;
	bp->b_flags = (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (void *)kva;
	bp->b_blkno = startblk;
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * bump v_numoutput (counter of number of active outputs).
	 */

	if (write) {
		mutex_enter(swapdev_vp->v_interlock);
		swapdev_vp->v_numoutput++;
		mutex_exit(swapdev_vp->v_interlock);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */

	if (async) {
		bp->b_iodone = uvm_aio_biodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
		if (curlwp == uvm.pagedaemon_lwp)
			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
		else
			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
	} else {
		bp->b_iodone = NULL;
		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %#jx blkno = 0x%jx, bcount = %jd",
	    (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */

	VOP_STRATEGY(swapdev_vp, bp);
	if (async)
		return 0;

	/*
	 * must be sync i/o.  wait for it to finish
	 */

	error = biowait(bp);

	/*
	 * kill the pager mapping
	 */

	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the buf and we're done.
	 */

	if (write) {
		mutex_enter(swapdev_vp->v_interlock);
		vwakeup(bp);
		mutex_exit(swapdev_vp->v_interlock);
	}
	putiobuf(bp);
	UVMHIST_LOG(pdhist, "<- done (sync)  error=%jd", error, 0, 0, 0);

	return (error);
}