/*	$NetBSD: uvm_swap.c,v 1.142 2008/12/17 20:51:39 cegger Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.142 2008/12/17 20:51:39 cegger Exp $");

#include "fs_nfs.h"
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vmem.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.  The actual work is done
 *	in the uvm_swap_stats(9) function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
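
/*
 * Illustrative sketch, not part of the original source: from userland the
 * operations above are reached through the swapctl(2) wrapper,
 * int swapctl(int cmd, void *arg, int misc).  The device path and priority
 * below are made-up example values.
 *
 *	#include <unistd.h>
 *	#include <sys/swap.h>
 *	#include <stdlib.h>
 *	#include <err.h>
 *
 *	if (swapctl(SWAP_ON, "/dev/wd0b", 0) == -1)	// enable, priority 0
 *		err(1, "SWAP_ON");
 *	int nswap = swapctl(SWAP_NSWAP, NULL, 0);	// count swap devices
 *	struct swapent *sep = malloc(nswap * sizeof(*sep));
 *	if (sep != NULL && swapctl(SWAP_STATS, sep, nswap) == nswap) {
 *		// sep[0 .. nswap-1] now holds the per-device stats
 *	}
 */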

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct oswapent swd_ose;
#define	swd_dev		swd_ose.ose_dev		/* device id */
#define	swd_flags	swd_ose.ose_flags	/* flags:inuse/enable/fake */
#define	swd_priority	swd_ose.ose_priority	/* our priority */
	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};


/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
POOL_INIT(vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", NULL,
    IPL_BIO);
POOL_INIT(vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", NULL,
    IPL_BIO);

/*
 * local variables
 */
MALLOC_DEFINE(M_VMSWAP, "VM swap", "VM swap structures");
static vmem_t *swapmap;		/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static krwlock_t swap_syscall_lock;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/* tuneables */
u_int uvm_swapisfull_factor = 99;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void uvm_swap_stats_locked(int, struct swapent *, int, register_t *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	cv_init(&uvm.scheduler_cv, "schedule");
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	/* XXXSMP should be at IPL_VM, but for audio interrupt handlers. */
	mutex_init(&uvm_scheduler_mutex, MUTEX_SPIN, IPL_SCHED);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");
	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
		panic("uvm_swap_init: can't lock swap device");
	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
		panic("uvm_swap_init: can't open swap device");
	VOP_UNLOCK(swapdev_vp, 0);

	/*
	 * create swap block resource map to map /dev/drum.  the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
	    VM_NOSLEEP, IPL_NONE);
	if (swapmap == 0)
		panic("uvm_swap_init: vmem_create failed");

	/*
	 * done!
	 */
	uvm.swap_running = true;
#ifdef __SWAP_BROKEN
	uvm.swapout_enabled = 0;
#else
	uvm.swapout_enabled = 1;
#endif
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);

	sysctl_createv(NULL, 0, NULL, NULL,
	    CTLFLAG_READWRITE,
	    CTLTYPE_INT, "swapout",
	    SYSCTL_DESCR("Set 0 to disable swapout of kernel stacks"),
	    NULL, 0, &uvm.swapout_enabled, 0, CTL_VM, CTL_CREATE, CTL_EOL);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc from
 *	blocking here while adding swap)
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
		    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		free(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
		}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 */
static void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
		nextspp = LIST_NEXT(spp, spi_swappri);
		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
		    (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP);
	}
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm_swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}

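/*
 * A minimal worked example of the drum mapping above (the numbers are made
 * up for illustration): if swapdev A was added with swd_drumoffset = 1 and
 * swd_drumsize = 1000, and swapdev B with swd_drumoffset = 1001 and
 * swd_drumsize = 500, then drum page 1200 falls in B's range [1001, 1501)
 * and corresponds to page 1200 - 1001 = 199 on B.  drum page 0 is never
 * handed out; it means "no allocation".
 */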

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
#define SWAP_PATH_MAX (PATH_MAX + 1)
	char	*userpath;
	size_t	len;
	int	error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter(&swap_syscall_lock, RW_WRITER);

	userpath = malloc(SWAP_PATH_MAX, M_TEMP, M_WAITOK);
	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm_swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_OSTATS
#endif
	    ) {
		if ((size_t)misc > (size_t)uvmexp.nswapdev)
			misc = uvmexp.nswapdev;
#if defined(COMPAT_13)
		if (SCARG(uap, cmd) == SWAP_OSTATS)
			len = sizeof(struct oswapent) * misc;
		else
#endif
			len = sizeof(struct swapent) * misc;
		sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK);

		uvm_swap_stats_locked(SCARG(uap, cmd), sep, misc, retval);
		error = copyout(sep, SCARG(uap, arg), len);

		free(sep, M_TEMP);
		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
		goto out;
	}
	if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
		dev_t	*devp = (dev_t *)SCARG(uap, arg);

		error = copyout(&dumpdev, devp, sizeof(dumpdev));
		goto out;
	}

	/*
	 * all other requests require superuser privs.  verify.
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
	    0, NULL, NULL, NULL)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
		/* drop the current dump device */
		dumpdev = NODEV;
		dumpcdev = NODEV;
		cpu_dumpconf();
		goto out;
	}

	/*
	 * at this point we expect a path name in arg.  we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    SWAP_PATH_MAX, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT,
		    space, where);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		if (bdevsw_lookup(vp->v_rdev)) {
			dumpdev = vp->v_rdev;
			dumpcdev = devsw_blk2chr(dumpdev);
		} else
			dumpdev = NODEV;
		cpu_dumpconf();
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, true)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		mutex_exit(&uvm_swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.  if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		memset(sdp, 0, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
		mutex_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, false) != NULL) {
			error = EBUSY;
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			free(sdp, M_VMSWAP);
			free(spp, M_VMSWAP);
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mutex_exit(&uvm_swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(l, sdp)) != 0) {
			mutex_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, true);  /* kill fake entry */
			swaplist_trim();
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}
		break;

	case SWAP_OFF:
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, false)) == NULL) {
			mutex_exit(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mutex_exit(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(l, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);

out:
	free(userpath, M_TEMP);
	rw_exit(&swap_syscall_lock);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * uvm_swap_stats: implements swapctl(SWAP_STATS).  The function is kept
 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 * emulation to use it directly without going through sys_swapctl().
 * The problem with using sys_swapctl() there is that it involves
 * copying the swapent array to the stackgap, and this array's size
 * is not known at build time.  Hence it would not be possible to
 * ensure it would fit in the stackgap in any case.
 */
void
uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval)
{

	rw_enter(&swap_syscall_lock, RW_READER);
	uvm_swap_stats_locked(cmd, sep, sec, retval);
	rw_exit(&swap_syscall_lock);
}

static void
uvm_swap_stats_locked(int cmd, struct swapent *sep, int sec, register_t *retval)
{
	struct swappri *spp;
	struct swapdev *sdp;
	int count = 0;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev && sec-- > 0;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
			/*
			 * backwards compatibility for system call.
			 * note that we use 'struct oswapent' as an
			 * overlay into both 'struct swapdev' and
			 * the userland 'struct swapent', as we
			 * want to retain backwards compatibility
			 * with NetBSD 1.3.
			 */
			sdp->swd_ose.ose_inuse =
			    btodb((uint64_t)sdp->swd_npginuse <<
			    PAGE_SHIFT);
			(void)memcpy(sep, &sdp->swd_ose,
			    sizeof(struct oswapent));

			/* now copy out the path if necessary */
#if !defined(COMPAT_13)
			(void) cmd;
#endif
#if defined(COMPAT_13)
			if (cmd == SWAP_STATS)
#endif
				(void)memcpy(&sep->se_path, sdp->swd_path,
				    sdp->swd_pathlen);

			count++;
#if defined(COMPAT_13)
			if (cmd == SWAP_OSTATS)
				sep = (struct swapent *)
				    ((struct oswapent *)sep + 1);
			else
#endif
				sep++;
		}
	}

	*retval = count;
	return;
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	u_long result;
	struct vattr va;
#ifdef NFS
	extern int (**nfsv2_vnodeop_p)(void *);
#endif /* NFS */
	const struct bdevsw *bdev;
	dev_t dev;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.  the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let the device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);

	/*
	 * we now need to determine the size of the swap area.  for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		bdev = bdevsw_lookup(dev);
		if (bdev == NULL || bdev->d_psize == NULL ||
		    (nblocks = (*bdev->d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATVFS(vp->v_mount, &vp->v_mount->mnt_stat)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.  take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_ose.ose_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.  we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate a blist to manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.  do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
		 */
		rootblocks += (rootblocks >> 5) +
			(rootblocks >> 6) +
			(rootblocks >> 7);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
			panic("swap_on: unable to preserve miniroot");
		}

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	result = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP);
	if (result == 0)
		panic("swapdrum_add");
	/*
	 * If this is the first regular swap create the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (vp->v_type != VBLK) {
		if (sw_reg_count++ == 0) {
			KASSERT(sw_reg_workqueue == NULL);
			if (workqueue_create(&sw_reg_workqueue, "swapiod",
			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
				panic("swap_add: workqueue_create failed");
		}
	}

	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	mutex_enter(&uvm_swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	uvmexp.swpgavail += size;
	mutex_exit(&uvm_swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	if (sdp->swd_blist) {
		blist_destroy(sdp->swd_blist);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
	}
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
	int npages = sdp->swd_npages;
	int error = 0;

	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%x, npages=%d", sdp->swd_dev,npages,0,0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	uvmexp.swpgavail -= npages;
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
			  sdp->swd_drumoffset + sdp->swd_drumsize)) {
		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		mutex_enter(&uvm_swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		uvmexp.swpgavail += npages;
		mutex_exit(&uvm_swap_data_lock);

		return error;
	}

	/*
	 * If this is the last regular swap destroy the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (sdp->swd_vp->v_type != VBLK) {
		KASSERT(sw_reg_count > 0);
		KASSERT(sw_reg_workqueue != NULL);
		if (--sw_reg_count == 0) {
			workqueue_destroy(sw_reg_workqueue);
			sw_reg_workqueue = NULL;
		}
	}

	/*
	 * done with the vnode.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
	}

	mutex_enter(&uvm_swap_data_lock);
	uvmexp.swpages -= npages;
	uvmexp.swpginuse -= sdp->swd_npgbad;

	if (swaplist_find(sdp->swd_vp, true) == NULL)
		panic("swap_off: swapdev not in list");
	swaplist_trim();
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * free all resources!
	 */
	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
	blist_destroy(sdp->swd_blist);
	bufq_free(sdp->swd_tab);
	free(sdp, M_VMSWAP);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
static void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	struct vnode *vp;
	int pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.  note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	mutex_exit(&uvm_swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	vp = sdp->swd_vp;		/* swapdev vnode pointer */
	switch (vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		bp->b_blkno = bn;		/* swapdev block number */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */

		/*
		 * if we are doing a write, we have to redirect the i/o
		 * accounting from the drum's v_numoutput counter to the
		 * swapdev's.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(bp->b_objlock);
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			mutex_exit(bp->b_objlock);
			mutex_enter(&vp->v_interlock);
			vp->v_numoutput++;	/* put it on swapdev */
			mutex_exit(&vp->v_interlock);
		}

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		bp->b_objlock = &vp->v_interlock;
		VOP_STRATEGY(vp, bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swread(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swwrite(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

const struct bdevsw swap_bdevsw = {
	nullopen, nullclose, swstrategy, noioctl, nodump, nosize, D_OTHER,
};

const struct cdevsw swap_cdevsw = {
	nullopen, nullclose, swread, swwrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER,
};

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	char		*addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((uint64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
		    "vp %p/%p offset 0x%x/0x%x",
		    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.  note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
		buf_init(&nbp->vb_buf);
		nbp->vb_buf.b_flags    = bp->b_flags;
		nbp->vb_buf.b_cflags   = bp->b_cflags;
		nbp->vb_buf.b_oflags   = bp->b_oflags;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_lblkno   = 0;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_iodone   = sw_reg_biodone;
		nbp->vb_buf.b_vp       = vp;
		nbp->vb_buf.b_objlock  = &vp->v_interlock;
		if (vp->v_type == VBLK) {
			nbp->vb_buf.b_dev = vp->v_rdev;
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			buf_destroy(&nbp->vb_buf);
			pool_put(&vndbuf_pool, nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* sort it in and start I/O if we are not over our limit */
		/* XXXAD locking */
		BUFQ_PUT(sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		error = vnx->vx_error;
		pool_put(&vndxfer_pool, vnx);
		bp->b_error = error;
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by b_rawblkno (above)
 */
static void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*bp;
	struct vnode	*vp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = BUFQ_GET(sdp->swd_tab);
		if (bp == NULL)
			break;
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start: bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		vp = bp->b_vp;
		KASSERT(bp->b_objlock == &vp->v_interlock);
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(&vp->v_interlock);
			vp->v_numoutput++;
			mutex_exit(&vp->v_interlock);
		}
		VOP_STRATEGY(vp, bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_biodone: one of our i/o's has completed
 */
static void
sw_reg_biodone(struct buf *bp)
{
	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(struct work *wk, void *dummy)
{
	struct vndbuf *vbp = (void *)wk;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int s, resid, error;
	KASSERT(&vbp->vb_buf.b_work == wk);
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error != 0) {
		/* pass error upward */
		error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
		UVMHIST_LOG(pdhist, "  got error=%d !", error, 0, 0, 0);
		vnx->vx_error = error;
	}

	/*
	 * kill vbp structure
	 */
	buf_destroy(&vbp->vb_buf);
	pool_put(&vndbuf_pool, vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			pbp->b_error = error;
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	}

	/*
	 * done!  start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm_swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?  definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	mutex_enter(&uvm_swap_data_lock);

ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			uint64_t result;

			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			result = blist_alloc(sdp->swd_blist, *nslots);
			if (result == BLIST_NONE) {
				continue;
			}
			KASSERT(result < sdp->swd_drumsize);

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			mutex_exit(&uvm_swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return (result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		/* XXXMRG: ugh!  blist should support this for us */
		goto ReTry;
	}
	/* XXXMRG: END HACK */

	mutex_exit(&uvm_swap_data_lock);
	return 0;
}

/*
 * uvm_swapisfull: return true if most of available swap is allocated
 * and in use.  we don't count some small portion as it may be inaccessible
 * to us at any given moment, for example if there is lock contention or if
 * pages are busy.
 */
bool
uvm_swapisfull(void)
{
	int swpgonly;
	bool rv;

	mutex_enter(&uvm_swap_data_lock);
	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
	swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
	    uvm_swapisfull_factor);
	rv = (swpgonly >= uvmexp.swpgavail);
	mutex_exit(&uvm_swap_data_lock);

	return (rv);
}
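
/*
 * For illustration (made-up numbers, not from the original source): with
 * the default uvm_swapisfull_factor of 99 and swpgavail = 1000, the test
 * above scales swpgonly by 100/99, so swap is reported full once swpgonly
 * reaches 990 pages, i.e. once about 99% of the available swap pages hold
 * the only copy of their data.
 */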

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(sdp != NULL);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */

	KASSERT(uvmexp.swpgonly >= nslots);
	uvmexp.swpgonly -= nslots;
	sdp->swd_npgbad += nslots;
	UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the blist, and return.  must hold uvm_swap_data_lock
	 * to do the lookup and access the blist.
	 */

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 */

int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int error;

	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	return error;
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 */

int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int error;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return EIO;
	}

	error = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	if (error == 0) {

		/*
		 * this page is no longer only in swap.
		 */

		mutex_enter(&uvm_swap_data_lock);
		KASSERT(uvmexp.swpgonly > 0);
		uvmexp.swpgonly--;
		mutex_exit(&uvm_swap_data_lock);
	}
	return error;
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct buf *bp;
	vaddr_t kva;
	int	error, mapinflags;
	bool write, async;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/*
	 * allocate a buf for the i/o.
	 */

	KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
	bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
	if (bp == NULL) {
		uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
		return ENOMEM;
	}

	/*
	 * convert starting drum slot to block number
	 */

	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel.
	 */

	mapinflags = !write ?
		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
	kva = uvm_pagermapin(pps, npages, mapinflags);

	/*
	 * fill in the bp/sbp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */

	bp->b_cflags = BC_BUSY | BC_NOCACHE;
	bp->b_flags = (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (void *)kva;
	bp->b_blkno = startblk;
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * bump v_numoutput (counter of number of active outputs).
	 */

	if (write) {
		mutex_enter(&swapdev_vp->v_interlock);
		swapdev_vp->v_numoutput++;
		mutex_exit(&swapdev_vp->v_interlock);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */

	if (async) {
		bp->b_iodone = uvm_aio_biodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
		if (curlwp == uvm.pagedaemon_lwp)
			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
		else
			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
	} else {
		bp->b_iodone = NULL;
		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */

	VOP_STRATEGY(swapdev_vp, bp);
	if (async)
		return 0;

	/*
	 * must be sync i/o.  wait for it to finish
	 */

	error = biowait(bp);

	/*
	 * kill the pager mapping
	 */

	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the buf and we're done.
	 */

	if (write) {
		mutex_enter(&swapdev_vp->v_interlock);
		vwakeup(bp);
		mutex_exit(&swapdev_vp->v_interlock);
	}
	putiobuf(bp);
	UVMHIST_LOG(pdhist, "<- done (sync)  error=%d", error, 0, 0, 0);

	return (error);
}