/*	$NetBSD: uvm_swap.c,v 1.130 2007/10/15 08:12:13 hannken Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.130 2007/10/15 08:12:13 hannken Exp $");

#include "fs_nfs.h"
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vmem.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
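 *
 * the "drum" is a single pseudo-device (/dev/drum) whose pages are
 * parceled out among the configured swap devices; e.g. with two
 * 1000-page devices, drum pages [1,1000] could map to one device and
 * [1001,2000] to the other (numbers are illustrative only).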
 *
 * locking:
 *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * sys_swapctl() performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.  The actual work is done
 *	in the uvm_swap_stats(9) function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 *    swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 *    swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct oswapent swd_ose;
#define	swd_dev		swd_ose.ose_dev		/* device id */
#define	swd_flags	swd_ose.ose_flags	/* flags:inuse/enable/fake */
#define	swd_priority	swd_ose.ose_priority	/* our priority */
	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
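 *
 * a single i/o to /dev/drum (the "vx_bp" parent buffer below) may be
 * split into several smaller i/os on the underlying filesystem, one
 * vndbuf per contiguous run of file blocks; "vx_pending" counts the
 * children still in flight.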
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};


/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
POOL_INIT(vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", NULL,
    IPL_BIO);
POOL_INIT(vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", NULL,
    IPL_BIO);

#define	getvndxfer(vnx)	do {						\
	int sp = splbio();						\
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);			\
	splx(sp);							\
} while (/*CONSTCOND*/ 0)

#define putvndxfer(vnx) {						\
	pool_put(&vndxfer_pool, (void *)(vnx));				\
}

#define	getvndbuf(vbp)	do {						\
	int sp = splbio();						\
	vbp = pool_get(&vndbuf_pool, PR_WAITOK);			\
	splx(sp);							\
} while (/*CONSTCOND*/ 0)

#define putvndbuf(vbp) {						\
	pool_put(&vndbuf_pool, (void *)(vbp));				\
}

/*
 * local variables
 */
MALLOC_DEFINE(M_VMSWAP, "VM swap", "VM swap structures");
static vmem_t *swapmap;	/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static krwlock_t swap_syscall_lock;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
			     struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void uvm_swap_stats_locked(int, struct swapent *, int, register_t *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	cv_init(&uvm.scheduler_cv, "schedule");
	/* XXXSMP should be adaptive, but needs vmobjlock replaced */
	mutex_init(&uvm_swap_data_lock, MUTEX_SPIN, IPL_NONE);

	/* XXXSMP should be at IPL_VM, but for audio interrupt handlers. */
	mutex_init(&uvm_scheduler_mutex, MUTEX_SPIN, IPL_SCHED);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.  the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.
	 * note that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
	    VM_NOSLEEP, IPL_NONE);
	if (swapmap == 0)
		panic("uvm_swap_init: vmem_create failed");

	/*
	 * done!
	 */
	uvm.swap_running = true;
	uvm.swapout_enabled = 1;
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);

	sysctl_createv(NULL, 0, NULL, NULL,
	    CTLFLAG_READWRITE,
	    CTLTYPE_INT, "swapout",
	    SYSCTL_DESCR("Set 0 to disable swapout of kernel stacks"),
	    NULL, 0, &uvm.swapout_enabled, 0, CTL_VM, CTL_CREATE, CTL_EOL);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
		    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
		}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
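 *	(empty entries appear when the last swapdev at a priority is
 *	removed by SWAP_OFF or re-prioritized by SWAP_CTL.)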
399 * 400 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 401 */ 402 static void 403 swaplist_trim(void) 404 { 405 struct swappri *spp, *nextspp; 406 407 for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) { 408 nextspp = LIST_NEXT(spp, spi_swappri); 409 if (CIRCLEQ_FIRST(&spp->spi_swapdev) != 410 (void *)&spp->spi_swapdev) 411 continue; 412 LIST_REMOVE(spp, spi_swappri); 413 free(spp, M_VMSWAP); 414 } 415 } 416 417 /* 418 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back 419 * to the "swapdev" that maps that section of the drum. 420 * 421 * => each swapdev takes one big contig chunk of the drum 422 * => caller must hold uvm_swap_data_lock 423 */ 424 static struct swapdev * 425 swapdrum_getsdp(int pgno) 426 { 427 struct swapdev *sdp; 428 struct swappri *spp; 429 430 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 431 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 432 if (sdp->swd_flags & SWF_FAKE) 433 continue; 434 if (pgno >= sdp->swd_drumoffset && 435 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { 436 return sdp; 437 } 438 } 439 } 440 return NULL; 441 } 442 443 444 /* 445 * sys_swapctl: main entry point for swapctl(2) system call 446 * [with two helper functions: swap_on and swap_off] 447 */ 448 int 449 sys_swapctl(struct lwp *l, void *v, register_t *retval) 450 { 451 struct sys_swapctl_args /* { 452 syscallarg(int) cmd; 453 syscallarg(void *) arg; 454 syscallarg(int) misc; 455 } */ *uap = (struct sys_swapctl_args *)v; 456 struct vnode *vp; 457 struct nameidata nd; 458 struct swappri *spp; 459 struct swapdev *sdp; 460 struct swapent *sep; 461 #define SWAP_PATH_MAX (PATH_MAX + 1) 462 char *userpath; 463 size_t len; 464 int error, misc; 465 int priority; 466 UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist); 467 468 misc = SCARG(uap, misc); 469 470 /* 471 * ensure serialized syscall access by grabbing the swap_syscall_lock 472 */ 473 rw_enter(&swap_syscall_lock, RW_WRITER); 474 475 userpath = malloc(SWAP_PATH_MAX, M_TEMP, M_WAITOK); 476 /* 477 * we handle the non-priv NSWAP and STATS request first. 478 * 479 * SWAP_NSWAP: return number of config'd swap devices 480 * [can also be obtained with uvmexp sysctl] 481 */ 482 if (SCARG(uap, cmd) == SWAP_NSWAP) { 483 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev, 484 0, 0, 0); 485 *retval = uvmexp.nswapdev; 486 error = 0; 487 goto out; 488 } 489 490 /* 491 * SWAP_STATS: get stats on current # of configured swap devs 492 * 493 * note that the swap_priority list can't change as long 494 * as we are holding the swap_syscall_lock. we don't want 495 * to grab the uvm_swap_data_lock because we may fault&sleep during 496 * copyout() and we don't want to be holding that lock then! 
497 */ 498 if (SCARG(uap, cmd) == SWAP_STATS 499 #if defined(COMPAT_13) 500 || SCARG(uap, cmd) == SWAP_OSTATS 501 #endif 502 ) { 503 if ((size_t)misc > (size_t)uvmexp.nswapdev) 504 misc = uvmexp.nswapdev; 505 #if defined(COMPAT_13) 506 if (SCARG(uap, cmd) == SWAP_OSTATS) 507 len = sizeof(struct oswapent) * misc; 508 else 509 #endif 510 len = sizeof(struct swapent) * misc; 511 sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK); 512 513 uvm_swap_stats_locked(SCARG(uap, cmd), sep, misc, retval); 514 error = copyout(sep, SCARG(uap, arg), len); 515 516 free(sep, M_TEMP); 517 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0); 518 goto out; 519 } 520 if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) { 521 dev_t *devp = (dev_t *)SCARG(uap, arg); 522 523 error = copyout(&dumpdev, devp, sizeof(dumpdev)); 524 goto out; 525 } 526 527 /* 528 * all other requests require superuser privs. verify. 529 */ 530 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL, 531 0, NULL, NULL, NULL))) 532 goto out; 533 534 if (SCARG(uap, cmd) == SWAP_DUMPOFF) { 535 /* drop the current dump device */ 536 dumpdev = NODEV; 537 cpu_dumpconf(); 538 goto out; 539 } 540 541 /* 542 * at this point we expect a path name in arg. we will 543 * use namei() to gain a vnode reference (vref), and lock 544 * the vnode (VOP_LOCK). 545 * 546 * XXX: a NULL arg means use the root vnode pointer (e.g. for 547 * miniroot) 548 */ 549 if (SCARG(uap, arg) == NULL) { 550 vp = rootvp; /* miniroot */ 551 if (vget(vp, LK_EXCLUSIVE)) { 552 error = EBUSY; 553 goto out; 554 } 555 if (SCARG(uap, cmd) == SWAP_ON && 556 copystr("miniroot", userpath, SWAP_PATH_MAX, &len)) 557 panic("swapctl: miniroot copy failed"); 558 } else { 559 int space; 560 char *where; 561 562 if (SCARG(uap, cmd) == SWAP_ON) { 563 if ((error = copyinstr(SCARG(uap, arg), userpath, 564 SWAP_PATH_MAX, &len))) 565 goto out; 566 space = UIO_SYSSPACE; 567 where = userpath; 568 } else { 569 space = UIO_USERSPACE; 570 where = (char *)SCARG(uap, arg); 571 } 572 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, space, where, l); 573 if ((error = namei(&nd))) 574 goto out; 575 vp = nd.ni_vp; 576 } 577 /* note: "vp" is referenced and locked */ 578 579 error = 0; /* assume no error */ 580 switch(SCARG(uap, cmd)) { 581 582 case SWAP_DUMPDEV: 583 if (vp->v_type != VBLK) { 584 error = ENOTBLK; 585 break; 586 } 587 if (bdevsw_lookup(vp->v_rdev)) 588 dumpdev = vp->v_rdev; 589 else 590 dumpdev = NODEV; 591 cpu_dumpconf(); 592 break; 593 594 case SWAP_CTL: 595 /* 596 * get new priority, remove old entry (if any) and then 597 * reinsert it in the correct place. finally, prune out 598 * any empty priority structures. 599 */ 600 priority = SCARG(uap, misc); 601 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK); 602 mutex_enter(&uvm_swap_data_lock); 603 if ((sdp = swaplist_find(vp, true)) == NULL) { 604 error = ENOENT; 605 } else { 606 swaplist_insert(sdp, spp, priority); 607 swaplist_trim(); 608 } 609 mutex_exit(&uvm_swap_data_lock); 610 if (error) 611 free(spp, M_VMSWAP); 612 break; 613 614 case SWAP_ON: 615 616 /* 617 * check for duplicates. if none found, then insert a 618 * dummy entry on the list to prevent someone else from 619 * trying to enable this device while we are working on 620 * it. 621 */ 622 623 priority = SCARG(uap, misc); 624 sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK); 625 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK); 626 memset(sdp, 0, sizeof(*sdp)); 627 sdp->swd_flags = SWF_FAKE; 628 sdp->swd_vp = vp; 629 sdp->swd_dev = (vp->v_type == VBLK) ? 
		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
		mutex_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, false) != NULL) {
			error = EBUSY;
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			free(sdp, M_VMSWAP);
			free(spp, M_VMSWAP);
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mutex_exit(&uvm_swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(l, sdp)) != 0) {
			mutex_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, true);  /* kill fake entry */
			swaplist_trim();
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}
		break;

	case SWAP_OFF:
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, false)) == NULL) {
			mutex_exit(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mutex_exit(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(l, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);

out:
	free(userpath, M_TEMP);
	rw_exit(&swap_syscall_lock);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * uvm_swap_stats: implements swapctl(SWAP_STATS).  The function is kept
 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 * emulation to use it directly without going through sys_swapctl().
 * The problem with using sys_swapctl() there is that it involves
 * copying the swapent array to the stackgap, and this array's size
 * is not known at build time.  Hence it would not be possible to
 * ensure it would fit in the stackgap in any case.
 */
void
uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval)
{

	rw_enter(&swap_syscall_lock, RW_READER);
	uvm_swap_stats_locked(cmd, sep, sec, retval);
	rw_exit(&swap_syscall_lock);
}

static void
uvm_swap_stats_locked(int cmd, struct swapent *sep, int sec, register_t *retval)
{
	struct swappri *spp;
	struct swapdev *sdp;
	int count = 0;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev && sec-- > 0;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
			/*
			 * backwards compatibility for system call.
			 * note that we use 'struct oswapent' as an
			 * overlay into both 'struct swapdev' and
			 * the userland 'struct swapent', as we
			 * want to retain backwards compatibility
			 * with NetBSD 1.3.
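			 * (the initial members of 'struct swapent'
			 * mirror 'struct oswapent' field for field;
			 * copying one oswapent fills that common
			 * prefix, and 'se_path' is appended only for
			 * the newer SWAP_STATS layout.)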
			 */
			sdp->swd_ose.ose_inuse =
			    btodb((uint64_t)sdp->swd_npginuse <<
			    PAGE_SHIFT);
			(void)memcpy(sep, &sdp->swd_ose,
			    sizeof(struct oswapent));

			/* now copy out the path if necessary */
#if !defined(COMPAT_13)
			(void) cmd;
#endif
#if defined(COMPAT_13)
			if (cmd == SWAP_STATS)
#endif
				(void)memcpy(&sep->se_path, sdp->swd_path,
				    sdp->swd_pathlen);

			count++;
#if defined(COMPAT_13)
			if (cmd == SWAP_OSTATS)
				sep = (struct swapent *)
				    ((struct oswapent *)sep + 1);
			else
#endif
				sep++;
		}
	}

	*retval = count;
	return;
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	u_long result;
	struct vattr va;
#ifdef NFS
	extern int (**nfsv2_vnodeop_p)(void *);
#endif /* NFS */
	const struct bdevsw *bdev;
	dev_t dev;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.  the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred, l)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0, 0);

	/*
	 * we now need to determine the size of the swap area.  for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		bdev = bdevsw_lookup(dev);
		if (bdev == NULL || bdev->d_psize == NULL ||
		    (nblocks = (*bdev->d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred, l)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATVFS(vp->v_mount, &vp->v_mount->mnt_stat, l)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.  take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
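	 * (e.g. with the usual DEV_BSIZE of 512 and a PAGE_SIZE of 4096,
	 * 8 disk blocks make one page, so a 65536-block partition would
	 * yield 8192 pages; numbers are illustrative only.)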
	 */

	sdp->swd_ose.ose_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.  we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate a blist to manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.  do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
		 */
		rootblocks += (rootblocks >> 5) +
		    (rootblocks >> 6) +
		    (rootblocks >> 7);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
			panic("swap_on: unable to preserve miniroot");
		}

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	result = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP);
	if (result == 0)
		panic("swapdrum_add");
	/*
	 * If this is the first regular swap create the workqueue.
	 * => Protected by swap_syscall_lock.
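	 * (completion interrupts for file-backed swap arrive in
	 * sw_reg_biodone() at IPL_BIO; the real cleanup in
	 * sw_reg_iodone() may need to sleep, so it is deferred to
	 * this workqueue thread.)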
	 */
	if (vp->v_type != VBLK) {
		if (sw_reg_count++ == 0) {
			KASSERT(sw_reg_workqueue == NULL);
			if (workqueue_create(&sw_reg_workqueue, "swapiod",
			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
				panic("swap_add: workqueue_create failed");
		}
	}

	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	mutex_enter(&uvm_swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	uvmexp.swpgavail += size;
	mutex_exit(&uvm_swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	if (sdp->swd_blist) {
		blist_destroy(sdp->swd_blist);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred, l);
	}
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
	int npages = sdp->swd_npages;
	int error = 0;

	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%x, npages=%d", sdp->swd_dev, npages, 0, 0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	uvmexp.swpgavail -= npages;
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize)) {
		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		mutex_enter(&uvm_swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		uvmexp.swpgavail += npages;
		mutex_exit(&uvm_swap_data_lock);

		return error;
	}

	/*
	 * If this is the last regular swap destroy the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (sdp->swd_vp->v_type != VBLK) {
		KASSERT(sw_reg_count > 0);
		KASSERT(sw_reg_workqueue != NULL);
		if (--sw_reg_count == 0) {
			workqueue_destroy(sw_reg_workqueue);
			sw_reg_workqueue = NULL;
		}
	}

	/*
	 * done with the vnode.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred, l);
	}

	mutex_enter(&uvm_swap_data_lock);
	uvmexp.swpages -= npages;
	uvmexp.swpginuse -= sdp->swd_npgbad;

	if (swaplist_find(sdp->swd_vp, true) == NULL)
		panic("swap_off: swapdev not in list");
	swaplist_trim();
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * free all resources!
	 */
	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
	blist_destroy(sdp->swd_blist);
	bufq_free(sdp->swd_tab);
	free(sdp, M_VMSWAP);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
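 *    (e.g. drum block N becomes drum page N / (PAGE_SIZE / DEV_BSIZE);
 *    subtracting swd_drumoffset then gives the page offset within the
 *    owning swapdev.  this follows the dbtob()/btodb() conversions
 *    used below.)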
 */
static void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	struct vnode *vp;
	int s, pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.  note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	mutex_exit(&uvm_swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		bp->b_blkno = bn;		/* swapdev block number */
		vp = sdp->swd_vp;		/* swapdev vnode pointer */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			V_INCR_NUMOUTPUT(vp);	/* put it on swapdev */
		}

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		splx(s);
		VOP_STRATEGY(vp, bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
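		 * (a regular file's blocks may be scattered anywhere
		 * on the underlying device, so each chunk must first
		 * be located with VOP_BMAP before it can be issued.)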
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swread(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swwrite(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

const struct bdevsw swap_bdevsw = {
	noopen, noclose, swstrategy, noioctl, nodump, nosize, D_OTHER,
};

const struct cdevsw swap_cdevsw = {
	nullopen, nullclose, swread, swwrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER,
};

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	char		*addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	getvndxfer(vnx);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((uint64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
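		 * VOP_BMAP told us the requested block plus "nra"
		 * following blocks are contiguous on disk, so up to
		 * (1 + nra) filesystem blocks, minus our byte offset
		 * ("off") into the first one, can go out as a single
		 * child i/o.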
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
		    "vp %p/%p offset 0x%x/0x%x",
		    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.  note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		getvndbuf(nbp);
		BUF_INIT(&nbp->vb_buf);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_lblkno   = 0;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_iodone   = sw_reg_biodone;
		nbp->vb_buf.b_vp       = vp;
		if (vp->v_type == VBLK) {
			nbp->vb_buf.b_dev = vp->v_rdev;
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* sort it in and start I/O if we are not over our limit */
		BUFQ_PUT(sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0)
			bp->b_error = vnx->vx_error;
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by b_rawblkno (above)
 */
static void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = BUFQ_GET(sdp->swd_tab);
		if (bp == NULL)
			break;
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			V_INCR_NUMOUTPUT(bp->b_vp);

		VOP_STRATEGY(bp->b_vp, bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_biodone: one of our i/o's has completed
 */
static void
sw_reg_biodone(struct buf *bp)
{
	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(struct work *wk, void *dummy)
{
	struct vndbuf *vbp = (void *)wk;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int s, resid, error;
	KASSERT(&vbp->vb_buf.b_work == wk);
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error != 0) {
		/* pass error upward */
		error = vbp->vb_buf.b_error;
		UVMHIST_LOG(pdhist, "  got error=%d !", error, 0, 0, 0);
		vnx->vx_error = error;
	}

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone, pbp=%p error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!  start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm_swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?  definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	mutex_enter(&uvm_swap_data_lock);

ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			uint64_t result;

			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			result = blist_alloc(sdp->swd_blist, *nslots);
			if (result == BLIST_NONE) {
				continue;
			}
			KASSERT(result < sdp->swd_drumsize);

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			mutex_exit(&uvm_swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return (result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		/* XXXMRG: ugh!  blist should support this for us */
		goto ReTry;
	}
	/* XXXMRG: END HACK */

	mutex_exit(&uvm_swap_data_lock);
	return 0;
}

bool
uvm_swapisfull(void)
{
	bool rv;

	mutex_enter(&uvm_swap_data_lock);
	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
	rv = (uvmexp.swpgonly >= uvmexp.swpgavail);
	mutex_exit(&uvm_swap_data_lock);

	return (rv);
}

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(sdp != NULL);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */

	KASSERT(uvmexp.swpgonly >= nslots);
	uvmexp.swpgonly -= nslots;
	sdp->swd_npgbad += nslots;
	UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0, 0, 0);
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the blist, and return.  must hold uvm_swap_data_lock
	 * to do the lookup and access the blist.
	 */

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 */

int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int error;

	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	return error;
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 */

int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int error;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return EIO;
	}

	error = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	if (error == 0) {

		/*
		 * this page is no longer only in swap.
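		 * (swpgonly counts pages whose only valid copy lives
		 * on swap; after a successful pagein there is also a
		 * copy in RAM, so the counter drops.)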
		 */

		mutex_enter(&uvm_swap_data_lock);
		KASSERT(uvmexp.swpgonly > 0);
		uvmexp.swpgonly--;
		mutex_exit(&uvm_swap_data_lock);
	}
	return error;
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct buf *bp;
	vaddr_t kva;
	int error, s, mapinflags;
	bool write, async;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/*
	 * convert starting drum slot to block number
	 */

	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel.
	 */

	mapinflags = !write ?
	    UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
	    UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
	kva = uvm_pagermapin(pps, npages, mapinflags);

	/*
	 * now allocate a buf for the i/o.
	 */

	bp = getiobuf();

	/*
	 * fill in the bp/sbp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */

	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (void *)kva;
	bp->b_blkno = startblk;
	bp->b_vp = swapdev_vp;
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * bump v_numoutput (counter of number of active outputs).
	 */

	if (write) {
		s = splbio();
		V_INCR_NUMOUTPUT(swapdev_vp);
		splx(s);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */

	if (async) {
		bp->b_flags |= B_CALL;
		bp->b_iodone = uvm_aio_biodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
		if (curlwp == uvm.pagedaemon_lwp)
			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
		else
			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
	} else {
		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */

	VOP_STRATEGY(swapdev_vp, bp);
	if (async)
		return 0;

	/*
	 * must be sync i/o.  wait for it to finish
	 */

	error = biowait(bp);

	/*
	 * kill the pager mapping
	 */

	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the buf and we're done.
	 */

	s = splbio();
	if (write)
		vwakeup(bp);
	putiobuf(bp);
	splx(s);
	UVMHIST_LOG(pdhist, "<- done (sync)  error=%d", error, 0, 0, 0);
	return (error);
}