/*	$NetBSD: uvm_swap.c,v 1.132 2007/12/08 19:29:57 pooka Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.132 2007/12/08 19:29:57 pooka Exp $");

#include "fs_nfs.h"
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vmem.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
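 *
 * for illustration (a hypothetical config, not taken from this file):
 * two partitions at priority 0 and one swap file at priority 10 give
 *
 *	swap_priority --> swappri(0) ----> swappri(10)
 *	                    |                |
 *	                    v                v
 *	                  { sd0b, sd1b }   { /swapfile }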
 *
 * locking:
 *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl function performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.  The actual work is done
 *	in the uvm_swap_stats(9) function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct oswapent swd_ose;
#define	swd_dev		swd_ose.ose_dev		/* device id */
#define	swd_flags	swd_ose.ose_flags	/* flags:inuse/enable/fake */
#define	swd_priority	swd_ose.ose_priority	/* our priority */
	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
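 *
 * (for illustration: one i/o on /dev/drum arriving in sw_reg_strategy()
 * becomes a single vndxfer plus one vndbuf per contiguous run of
 * filesystem blocks found via VOP_BMAP(); vx_pending counts the
 * vndbufs still outstanding:
 *
 *	parent buf <-- vndxfer <-- vndbuf ... vndbuf
 * )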
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};


/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
POOL_INIT(vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", NULL,
    IPL_BIO);
POOL_INIT(vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", NULL,
    IPL_BIO);

#define	getvndxfer(vnx)	do {						\
	int sp = splbio();						\
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);			\
	splx(sp);							\
} while (/*CONSTCOND*/ 0)

#define putvndxfer(vnx) {						\
	pool_put(&vndxfer_pool, (void *)(vnx));				\
}

#define	getvndbuf(vbp)	do {						\
	int sp = splbio();						\
	vbp = pool_get(&vndbuf_pool, PR_WAITOK);			\
	splx(sp);							\
} while (/*CONSTCOND*/ 0)

#define putvndbuf(vbp) {						\
	pool_put(&vndbuf_pool, (void *)(vbp));				\
}

/*
 * local variables
 */
MALLOC_DEFINE(M_VMSWAP, "VM swap", "VM swap structures");
static vmem_t *swapmap;	/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static krwlock_t swap_syscall_lock;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void uvm_swap_stats_locked(int, struct swapent *, int, register_t *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	cv_init(&uvm.scheduler_cv, "schedule");
	/* XXXSMP should be adaptive, but needs vmobjlock replaced */
	mutex_init(&uvm_swap_data_lock, MUTEX_SPIN, IPL_NONE);

	/* XXXSMP should be at IPL_VM, but for audio interrupt handlers. */
	mutex_init(&uvm_scheduler_mutex, MUTEX_SPIN, IPL_SCHED);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.  the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
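	/*
	 * (for illustration: a drum "block" here is one page, so with
	 * 4 KB pages the 2^31 - 1 usable slots address roughly 8 TB of
	 * swap; reserving slot 0 as the failure value is also why
	 * uvm_swap_alloc() below can return 0 to mean "no slot".)
	 */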
	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
	    VM_NOSLEEP, IPL_NONE);
	if (swapmap == 0)
		panic("uvm_swap_init: vmem_create failed");

	/*
	 * done!
	 */
	uvm.swap_running = true;
	uvm.swapout_enabled = 1;
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);

	sysctl_createv(NULL, 0, NULL, NULL,
	    CTLFLAG_READWRITE,
	    CTLTYPE_INT, "swapout",
	    SYSCTL_DESCR("Set 0 to disable swapout of kernel stacks"),
	    NULL, 0, &uvm.swapout_enabled, 0, CTL_VM, CTL_CREATE, CTL_EOL);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;  /* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
			    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 * global list.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
		}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 * them.
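 *
 * (note: an empty priority is detected below by CIRCLEQ_FIRST()
 * pointing back at the list head itself, the CIRCLEQ idiom for a
 * list with no members.)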
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 */
static void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
		nextspp = LIST_NEXT(spp, spi_swappri);
		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
		    (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP);
	}
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 * to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm_swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}


/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(struct lwp *l, void *v, register_t *retval)
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
#define SWAP_PATH_MAX (PATH_MAX + 1)
	char	*userpath;
	size_t	len;
	int	error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter(&swap_syscall_lock, RW_WRITER);

	userpath = malloc(SWAP_PATH_MAX, M_TEMP, M_WAITOK);
	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm_swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
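	/*
	 * (for illustration, a hypothetical userland caller -- not part
	 * of the kernel:
	 *
	 *	struct swapent se[8];
	 *	int n = swapctl(SWAP_STATS, se, 8);
	 *
	 *	if (n != -1)
	 *		while (n-- > 0)
	 *			printf("%s: %d/%d blocks in use\n",
	 *			    se[n].se_path, se[n].se_inuse,
	 *			    se[n].se_nblks);
	 * )
	 */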
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_OSTATS
#endif
	    ) {
		if ((size_t)misc > (size_t)uvmexp.nswapdev)
			misc = uvmexp.nswapdev;
#if defined(COMPAT_13)
		if (SCARG(uap, cmd) == SWAP_OSTATS)
			len = sizeof(struct oswapent) * misc;
		else
#endif
			len = sizeof(struct swapent) * misc;
		sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK);

		uvm_swap_stats_locked(SCARG(uap, cmd), sep, misc, retval);
		error = copyout(sep, SCARG(uap, arg), len);

		free(sep, M_TEMP);
		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
		goto out;
	}
	if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
		dev_t	*devp = (dev_t *)SCARG(uap, arg);

		error = copyout(&dumpdev, devp, sizeof(dumpdev));
		goto out;
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
	    0, NULL, NULL, NULL)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
		/* drop the current dump device */
		dumpdev = NODEV;
		cpu_dumpconf();
		goto out;
	}

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    SWAP_PATH_MAX, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT,
		    space, where);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		if (bdevsw_lookup(vp->v_rdev))
			dumpdev = vp->v_rdev;
		else
			dumpdev = NODEV;
		cpu_dumpconf();
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, true)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		mutex_exit(&uvm_swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		memset(sdp, 0, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;
		sdp->swd_vp = vp;
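		/* regular files have no dev_t; NODEV marks that case */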
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
		mutex_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, false) != NULL) {
			error = EBUSY;
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			free(sdp, M_VMSWAP);
			free(spp, M_VMSWAP);
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mutex_exit(&uvm_swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(l, sdp)) != 0) {
			mutex_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, true);  /* kill fake entry */
			swaplist_trim();
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}
		break;

	case SWAP_OFF:
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, false)) == NULL) {
			mutex_exit(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mutex_exit(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(l, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);

out:
	free(userpath, M_TEMP);
	rw_exit(&swap_syscall_lock);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * swap_stats: implements swapctl(SWAP_STATS). The function is kept
 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 * emulation to use it directly without going through sys_swapctl().
 * The problem with using sys_swapctl() there is that it involves
 * copying the swapent array to the stackgap, and this array's size
 * is not known at build time. Hence it would not be possible to
 * ensure it would fit in the stackgap in any case.
 */
void
uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval)
{

	rw_enter(&swap_syscall_lock, RW_READER);
	uvm_swap_stats_locked(cmd, sep, sec, retval);
	rw_exit(&swap_syscall_lock);
}

static void
uvm_swap_stats_locked(int cmd, struct swapent *sep, int sec, register_t *retval)
{
	struct swappri *spp;
	struct swapdev *sdp;
	int count = 0;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev && sec-- > 0;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
			/*
			 * backwards compatibility for system call.
			 * note that we use 'struct oswapent' as an
			 * overlay into both 'struct swapdev' and
			 * the userland 'struct swapent', as we
			 * want to retain backwards compatibility
			 * with NetBSD 1.3.
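			 *
			 * (layout sketch: struct swapent begins with
			 * the same members as struct oswapent, with
			 * se_path[] appended at the end, so one
			 * memcpy of the oswapent fills everything
			 * but the path.)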
			 */
			sdp->swd_ose.ose_inuse =
			    btodb((uint64_t)sdp->swd_npginuse <<
			    PAGE_SHIFT);
			(void)memcpy(sep, &sdp->swd_ose,
			    sizeof(struct oswapent));

			/* now copy out the path if necessary */
#if !defined(COMPAT_13)
			(void) cmd;
#endif
#if defined(COMPAT_13)
			if (cmd == SWAP_STATS)
#endif
				(void)memcpy(&sep->se_path, sdp->swd_path,
				    sdp->swd_pathlen);

			count++;
#if defined(COMPAT_13)
			if (cmd == SWAP_OSTATS)
				sep = (struct swapent *)
				    ((struct oswapent *)sep + 1);
			else
#endif
				sep++;
		}
	}

	*retval = count;
	return;
}

/*
 * swap_on: attempt to enable a swapdev for swapping.   note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	u_long result;
	struct vattr va;
#ifdef NFS
	extern int (**nfsv2_vnodeop_p)(void *);
#endif /* NFS */
	const struct bdevsw *bdev;
	dev_t dev;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);

	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		bdev = bdevsw_lookup(dev);
		if (bdev == NULL || bdev->d_psize == NULL ||
		    (nblocks = (*bdev->d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATVFS(vp->v_mount, &vp->v_mount->mnt_stat)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
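	 *
	 * (for illustration: with DEV_BSIZE = 512 and PAGE_SIZE = 4096,
	 * dbtob(nblocks) is nblocks * 512, so npages = nblocks / 8.)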
	 */

	sdp->swd_ose.ose_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate a blist to manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
		 */
		rootblocks += (rootblocks >> 5) +
		    (rootblocks >> 6) +
		    (rootblocks >> 7);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
			panic("swap_on: unable to preserve miniroot");
		}

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	result = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP);
	if (result == 0)
		panic("swapdrum_add");
	/*
	 * If this is the first regular swap create the workqueue.
	 * => Protected by swap_syscall_lock.
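	 *
	 * (sw_reg_count counts file-backed swap devices: the queue is
	 * created on the 0 -> 1 transition here and destroyed on the
	 * 1 -> 0 transition in swap_off().)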
	 */
	if (vp->v_type != VBLK) {
		if (sw_reg_count++ == 0) {
			KASSERT(sw_reg_workqueue == NULL);
			if (workqueue_create(&sw_reg_workqueue, "swapiod",
			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
				panic("swap_add: workqueue_create failed");
		}
	}

	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	mutex_enter(&uvm_swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	uvmexp.swpgavail += size;
	mutex_exit(&uvm_swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	if (sdp->swd_blist) {
		blist_destroy(sdp->swd_blist);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
	}
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
	int npages = sdp->swd_npages;
	int error = 0;

	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%x, npages=%d", sdp->swd_dev,npages,0,0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	uvmexp.swpgavail -= npages;
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
			  sdp->swd_drumoffset + sdp->swd_drumsize)) {
		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		mutex_enter(&uvm_swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		uvmexp.swpgavail += npages;
		mutex_exit(&uvm_swap_data_lock);

		return error;
	}

	/*
	 * If this is the last regular swap destroy the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (sdp->swd_vp->v_type != VBLK) {
		KASSERT(sw_reg_count > 0);
		KASSERT(sw_reg_workqueue != NULL);
		if (--sw_reg_count == 0) {
			workqueue_destroy(sw_reg_workqueue);
			sw_reg_workqueue = NULL;
		}
	}

	/*
	 * done with the vnode.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
	}

	mutex_enter(&uvm_swap_data_lock);
	uvmexp.swpages -= npages;
	uvmexp.swpginuse -= sdp->swd_npgbad;

	if (swaplist_find(sdp->swd_vp, true) == NULL)
		panic("swap_off: swapdev not in list");
	swaplist_trim();
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * free all resources!
	 */
	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
	blist_destroy(sdp->swd_blist);
	bufq_free(sdp->swd_tab);
	free(sdp, M_VMSWAP);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
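 *
 * (worked example, for illustration: with 4 KB pages a request at
 * b_blkno 40960 is drum page 5120; a swapdev with swd_drumoffset 4096
 * and swd_drumsize 2048 covers drum pages 4096-6143, so the request
 * maps to page 1024 of that device, i.e. disk block 8192 on it.)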
 */
static void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	struct vnode *vp;
	int s, pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	mutex_exit(&uvm_swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		bp->b_blkno = bn;		/* swapdev block number */
		vp = sdp->swd_vp;		/* swapdev vnode pointer */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			V_INCR_NUMOUTPUT(vp);	/* put it on swapdev */
		}

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		splx(s);
		VOP_STRATEGY(vp, bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swread(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swwrite(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

const struct bdevsw swap_bdevsw = {
	noopen, noclose, swstrategy, noioctl, nodump, nosize, D_OTHER,
};

const struct cdevsw swap_cdevsw = {
	nullopen, nullclose, swread, swwrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER,
};

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	char		*addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	getvndxfer(vnx);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((uint64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
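		 *
		 * (for illustration: with swd_bsize = 8192, byteoff =
		 * 12288 and nra = 1, off = 4096 and sz = 2 * 8192 -
		 * 4096 = 12288 bytes, clipped to resid below.)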
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
		    "vp %p/%p offset 0x%x/0x%x",
		    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		getvndbuf(nbp);
		BUF_INIT(&nbp->vb_buf);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_lblkno   = 0;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_iodone   = sw_reg_biodone;
		nbp->vb_buf.b_vp       = vp;
		if (vp->v_type == VBLK) {
			nbp->vb_buf.b_dev = vp->v_rdev;
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* sort it in and start I/O if we are not over our limit */
		BUFQ_PUT(sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0)
			bp->b_error = vnx->vx_error;
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by b_rawblkno (above)
 */
static void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = BUFQ_GET(sdp->swd_tab);
		if (bp == NULL)
			break;
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			V_INCR_NUMOUTPUT(bp->b_vp);

		VOP_STRATEGY(bp->b_vp, bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_biodone: one of our i/o's has completed
 */
static void
sw_reg_biodone(struct buf *bp)
{
	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(struct work *wk, void *dummy)
{
	struct vndbuf *vbp = (void *)wk;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int s, resid, error;
	KASSERT(&vbp->vb_buf.b_work == wk);
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
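	 *
	 * (the parent buf is finished only when VX_BUSY is clear --
	 * i.e. sw_reg_strategy() is done issuing vndbufs -- and
	 * vx_pending has dropped to zero; both conditions are checked
	 * below.)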
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error != 0) {
		/* pass error upward */
		error = vbp->vb_buf.b_error;
		UVMHIST_LOG(pdhist, "  got error=%d !", error, 0, 0, 0);
		vnx->vx_error = error;
	}

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone pbp=%p error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm_swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?   definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	mutex_enter(&uvm_swap_data_lock);

ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			uint64_t result;

			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			result = blist_alloc(sdp->swd_blist, *nslots);
			if (result == BLIST_NONE) {
				continue;
			}
			KASSERT(result < sdp->swd_drumsize);

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			mutex_exit(&uvm_swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return (result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		/* XXXMRG: ugh!  blist should support this for us */
		goto ReTry;
	}
	/* XXXMRG: END HACK */

	mutex_exit(&uvm_swap_data_lock);
	return 0;
}

bool
uvm_swapisfull(void)
{
	bool rv;

	mutex_enter(&uvm_swap_data_lock);
	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
	rv = (uvmexp.swpgonly >= uvmexp.swpgavail);
	mutex_exit(&uvm_swap_data_lock);

	return (rv);
}

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(sdp != NULL);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */

	KASSERT(uvmexp.swpgonly >= nslots);
	uvmexp.swpgonly -= nslots;
	sdp->swd_npgbad += nslots;
	UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the blist, and return.   must hold pri lock to do
	 * lookup and access the blist.
	 */

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 */

int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int error;

	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	return error;
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 */

int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int error;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return EIO;
	}

	error = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	if (error == 0) {

		/*
		 * this page is no longer only in swap.
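		 *
		 * (swpgonly counts pages whose only copy lives on
		 * swap; a successful pagein means an in-memory copy
		 * now exists as well, so the counter drops.)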
		 */

		mutex_enter(&uvm_swap_data_lock);
		KASSERT(uvmexp.swpgonly > 0);
		uvmexp.swpgonly--;
		mutex_exit(&uvm_swap_data_lock);
	}
	return error;
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct	buf *bp;
	vaddr_t kva;
	int	error, s, mapinflags;
	bool write, async;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/*
	 * convert starting drum slot to block number
	 */

	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel.
	 */

	mapinflags = !write ?
		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
	kva = uvm_pagermapin(pps, npages, mapinflags);

	/*
	 * now allocate a buf for the i/o.
	 */

	bp = getiobuf();

	/*
	 * fill in the bp/sbp.   we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */

	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (void *)kva;
	bp->b_blkno = startblk;
	bp->b_vp = swapdev_vp;
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * bump v_numoutput (counter of number of active outputs).
	 */

	if (write) {
		s = splbio();
		V_INCR_NUMOUTPUT(swapdev_vp);
		splx(s);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */

	if (async) {
		bp->b_flags |= B_CALL;
		bp->b_iodone = uvm_aio_biodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
		if (curlwp == uvm.pagedaemon_lwp)
			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
		else
			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
	} else {
		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */

	VOP_STRATEGY(swapdev_vp, bp);
	if (async)
		return 0;

	/*
	 * must be sync i/o.   wait for it to finish
	 */

	error = biowait(bp);

	/*
	 * kill the pager mapping
	 */

	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the buf and we're done.
	 */

	s = splbio();
	if (write)
		vwakeup(bp);
	putiobuf(bp);
	splx(s);
	UVMHIST_LOG(pdhist, "<- done (sync)  error=%d", error, 0, 0, 0);
	return (error);
}