/*	$NetBSD: uvm_swap.c,v 1.138 2008/05/11 20:19:27 kardel Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.138 2008/05/11 20:19:27 kardel Exp $");

#include "fs_nfs.h"
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vmem.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.   there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.   this LIST is headed
 * by the "swap_priority" global var.   each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * sys_swapctl() performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array. The actual work is done
 *	in the uvm_swap_stats(9) function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

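/*
 * as a rough illustration of the interface described above, a userland
 * program could enumerate the configured swap devices more or less like
 * this (a sketch only -- error checking omitted, field names taken from
 * struct swapent; see swapctl(2) for the authoritative usage):
 *
 *	int i, n = swapctl(SWAP_NSWAP, NULL, 0);
 *	struct swapent *sep = calloc(n, sizeof(*sep));
 *	n = swapctl(SWAP_STATS, sep, n);
 *	for (i = 0; i < n; i++)
 *		printf("%s: %d blks, %d in use, pri %d\n", sep[i].se_path,
 *		    sep[i].se_nblks, sep[i].se_inuse, sep[i].se_priority);
 */
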
/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct oswapent swd_ose;
#define	swd_dev		swd_ose.ose_dev		/* device id */
#define	swd_flags	swd_ose.ose_flags	/* flags:inuse/enable/fake */
#define	swd_priority	swd_ose.ose_priority	/* our priority */
	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

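/*
 * to make the relationship between the two structures above concrete,
 * the global state can be pictured roughly as follows (an illustrative
 * sketch, not a dump of real data):
 *
 *	swap_priority (LIST, sorted by spi_priority)
 *	   |
 *	   +--> swappri { spi_priority = 0 } --> swappri { spi_priority = 10 }
 *	             |                                 |
 *	          CIRCLEQ: sdA <-> sdB              CIRCLEQ: sdC
 *
 * devices with numerically lower priority values are tried first; within
 * one swappri the circleq is rotated so equal-priority devices share load
 * (see uvm_swap_alloc() below).
 */
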
/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};


/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
POOL_INIT(vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", NULL,
    IPL_BIO);
POOL_INIT(vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", NULL,
    IPL_BIO);

/*
 * local variables
 */
MALLOC_DEFINE(M_VMSWAP, "VM swap", "VM swap structures");
static vmem_t *swapmap;		/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static krwlock_t swap_syscall_lock;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void uvm_swap_stats_locked(int, struct swapent *, int, register_t *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	cv_init(&uvm.scheduler_cv, "schedule");
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	/* XXXSMP should be at IPL_VM, but for audio interrupt handlers. */
	mutex_init(&uvm_scheduler_mutex, MUTEX_SPIN, IPL_SCHED);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");
	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
		panic("uvm_swap_init: can't lock swap device");
	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
		panic("uvm_swap_init: can't open swap device");
	VOP_UNLOCK(swapdev_vp, 0);

	/*
	 * create swap block resource map to map /dev/drum.   the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.   note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
	    VM_NOSLEEP, IPL_NONE);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * done!
	 */
	uvm.swap_running = true;
	uvm.swapout_enabled = 1;
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);

	sysctl_createv(NULL, 0, NULL, NULL,
	    CTLFLAG_READWRITE,
	    CTLTYPE_INT, "swapout",
	    SYSCTL_DESCR("Set 0 to disable swapout of kernel stacks"),
	    NULL, 0, &uvm.swapout_enabled, 0, CTL_VM, CTL_CREATE, CTL_EOL);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
		    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).   now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
		}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 */
static void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
		nextspp = LIST_NEXT(spp, spi_swappri);
		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
		    (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP);
	}
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm_swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}

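/*
 * for example (illustrative numbers only): a swapdev that was assigned
 * the drum range [swd_drumoffset = 1, swd_drumoffset + swd_drumsize = 1001)
 * owns drum page 517; its page number on that device is 517 - 1 = 516,
 * and -- assuming 4KB pages and 512-byte disk blocks -- the matching
 * device block is btodb(516 << PAGE_SHIFT) = 4128.   this is the same
 * conversion swstrategy() performs below.
 */
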
/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
#define SWAP_PATH_MAX (PATH_MAX + 1)
	char	*userpath;
	size_t	len;
	int	error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter(&swap_syscall_lock, RW_WRITER);

	userpath = malloc(SWAP_PATH_MAX, M_TEMP, M_WAITOK);
	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.   we don't want
	 * to grab the uvm_swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_OSTATS
#endif
	    ) {
		if ((size_t)misc > (size_t)uvmexp.nswapdev)
			misc = uvmexp.nswapdev;
#if defined(COMPAT_13)
		if (SCARG(uap, cmd) == SWAP_OSTATS)
			len = sizeof(struct oswapent) * misc;
		else
#endif
			len = sizeof(struct swapent) * misc;
		sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK);

		uvm_swap_stats_locked(SCARG(uap, cmd), sep, misc, retval);
		error = copyout(sep, SCARG(uap, arg), len);

		free(sep, M_TEMP);
		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
		goto out;
	}
	if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
		dev_t	*devp = (dev_t *)SCARG(uap, arg);

		error = copyout(&dumpdev, devp, sizeof(dumpdev));
		goto out;
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
	    0, NULL, NULL, NULL)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
		/* drop the current dump device */
		dumpdev = NODEV;
		dumpcdev = NODEV;
		cpu_dumpconf();
		goto out;
	}

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    SWAP_PATH_MAX, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT,
		    space, where);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		if (bdevsw_lookup(vp->v_rdev)) {
			dumpdev = vp->v_rdev;
			dumpcdev = devsw_blk2chr(dumpdev);
		} else
			dumpdev = NODEV;
		cpu_dumpconf();
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.   finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, true)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		mutex_exit(&uvm_swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		memset(sdp, 0, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
		mutex_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, false) != NULL) {
			error = EBUSY;
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			free(sdp, M_VMSWAP);
			free(spp, M_VMSWAP);
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mutex_exit(&uvm_swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.   if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(l, sdp)) != 0) {
			mutex_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, true);  /* kill fake entry */
			swaplist_trim();
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}
		break;

	case SWAP_OFF:
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, false)) == NULL) {
			mutex_exit(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mutex_exit(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(l, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done! release the ref gained by namei() and unlock.
	 */
	vput(vp);

out:
	free(userpath, M_TEMP);
	rw_exit(&swap_syscall_lock);

	UVMHIST_LOG(pdhist, "<- done! error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 * emulation to use it directly without going through sys_swapctl().
 * The problem with using sys_swapctl() there is that it involves
 * copying the swapent array to the stackgap, and this array's size
 * is not known at build time. Hence it would not be possible to
 * ensure it would fit in the stackgap in any case.
 */
void
uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval)
{

	rw_enter(&swap_syscall_lock, RW_READER);
	uvm_swap_stats_locked(cmd, sep, sec, retval);
	rw_exit(&swap_syscall_lock);
}

static void
uvm_swap_stats_locked(int cmd, struct swapent *sep, int sec, register_t *retval)
{
	struct swappri *spp;
	struct swapdev *sdp;
	int count = 0;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev && sec-- > 0;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
			/*
			 * backwards compatibility for system call.
			 * note that we use 'struct oswapent' as an
			 * overlay into both 'struct swapdev' and
			 * the userland 'struct swapent', as we
			 * want to retain backwards compatibility
			 * with NetBSD 1.3.
			 */
			sdp->swd_ose.ose_inuse =
			    btodb((uint64_t)sdp->swd_npginuse <<
			    PAGE_SHIFT);
			(void)memcpy(sep, &sdp->swd_ose,
			    sizeof(struct oswapent));

			/* now copy out the path if necessary */
#if !defined(COMPAT_13)
			(void) cmd;
#endif
#if defined(COMPAT_13)
			if (cmd == SWAP_STATS)
#endif
				(void)memcpy(&sep->se_path, sdp->swd_path,
				    sdp->swd_pathlen);

			count++;
#if defined(COMPAT_13)
			if (cmd == SWAP_OSTATS)
				sep = (struct swapent *)
				    ((struct oswapent *)sep + 1);
			else
#endif
				sep++;
		}
	}

	*retval = count;
	return;
}

/*
 * swap_on: attempt to enable a swapdev for swapping.   note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	u_long result;
	struct vattr va;
#ifdef NFS
	extern int (**nfsv2_vnodeop_p)(void *);
#endif /* NFS */
	const struct bdevsw *bdev;
	dev_t dev;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, " dev=%d, major(dev)=%d", dev, major(dev), 0,0);

	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		bdev = bdevsw_lookup(dev);
		if (bdev == NULL || bdev->d_psize == NULL ||
		    (nblocks = (*bdev->d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATVFS(vp->v_mount, &vp->v_mount->mnt_stat)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_ose.ose_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, " dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate the blist that will manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
		 */
		rootblocks += (rootblocks >> 5) +
			(rootblocks >> 6) +
			(rootblocks >> 7);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
			panic("swap_on: unable to preserve miniroot");
		}

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	result = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP);
	if (result == 0)
		panic("swapdrum_add");
	/*
	 * If this is the first regular swap create the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (vp->v_type != VBLK) {
		if (sw_reg_count++ == 0) {
			KASSERT(sw_reg_workqueue == NULL);
			if (workqueue_create(&sw_reg_workqueue, "swapiod",
			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
				panic("swap_add: workqueue_create failed");
		}
	}

	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	mutex_enter(&uvm_swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	uvmexp.swpgavail += size;
	mutex_exit(&uvm_swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	if (sdp->swd_blist) {
		blist_destroy(sdp->swd_blist);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
	}
	return (error);
}

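/*
 * a worked example of the sizing done in swap_on() above (illustrative
 * numbers, assuming 4KB pages and 512-byte disk blocks): a 256MB
 * block-special swap partition reports nblocks = 524288, so
 * npages = 65536; since page 0 is skipped to protect the disklabel,
 * addr = 1 and size = 65535 pages are handed to the blist.
 */
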
/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
	int npages = sdp->swd_npages;
	int error = 0;

	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, " dev=%x, npages=%d", sdp->swd_dev,npages,0,0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	uvmexp.swpgavail -= npages;
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.   in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.   call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
	    sdp->swd_drumoffset + sdp->swd_drumsize)) {
		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		mutex_enter(&uvm_swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		uvmexp.swpgavail += npages;
		mutex_exit(&uvm_swap_data_lock);

		return error;
	}

	/*
	 * If this is the last regular swap destroy the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (sdp->swd_vp->v_type != VBLK) {
		KASSERT(sw_reg_count > 0);
		KASSERT(sw_reg_workqueue != NULL);
		if (--sw_reg_count == 0) {
			workqueue_destroy(sw_reg_workqueue);
			sw_reg_workqueue = NULL;
		}
	}

	/*
	 * done with the vnode.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
	}

	mutex_enter(&uvm_swap_data_lock);
	uvmexp.swpages -= npages;
	uvmexp.swpginuse -= sdp->swd_npgbad;

	if (swaplist_find(sdp->swd_vp, true) == NULL)
		panic("swap_off: swapdev not in list");
	swaplist_trim();
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * free all resources!
	 */
	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
	blist_destroy(sdp->swd_blist);
	bufq_free(sdp->swd_tab);
	free(sdp, M_VMSWAP);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
static void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	struct vnode *vp;
	int pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	mutex_exit(&uvm_swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		biodone(bp);
		UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	UVMHIST_LOG(pdhist, " %s: mapoff=%x bn=%x bcount=%ld",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	vp = sdp->swd_vp;		/* swapdev vnode pointer */
	switch (vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		bp->b_blkno = bn;		/* swapdev block number */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */

		/*
		 * if we are doing a write, we have to redirect the i/o
		 * accounting from the drum's v_numoutput counter to the
		 * swapdev's.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(bp->b_objlock);
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			mutex_exit(bp->b_objlock);
			mutex_enter(&vp->v_interlock);
			vp->v_numoutput++;	/* put it on swapdev */
			mutex_exit(&vp->v_interlock);
		}

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		bp->b_objlock = &vp->v_interlock;
		VOP_STRATEGY(vp, bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swread(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
static int
swwrite(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

const struct bdevsw swap_bdevsw = {
	nullopen, nullclose, swstrategy, noioctl, nodump, nosize, D_OTHER,
};

const struct cdevsw swap_cdevsw = {
	nullopen, nullclose, swread, swwrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER,
};

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	char		*addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((uint64_t)bn);

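	/*
	 * a quick illustration of the loop below (hypothetical numbers):
	 * with swd_bsize = 8192 and byteoff = 20480, VOP_BMAP() is asked
	 * for file block 20480 / 8192 = 2; off = 20480 % 8192 = 4096, and
	 * if the filesystem reports nra = 1 block of read-ahead the chunk
	 * size is sz = (1 + 1) * 8192 - 4096 = 12288 bytes, clipped to the
	 * amount remaining in the caller's buffer.
	 */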
	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.   return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
		    "vp %p/%p offset 0x%x/0x%x",
		    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
		buf_init(&nbp->vb_buf);
		nbp->vb_buf.b_flags    = bp->b_flags;
		nbp->vb_buf.b_cflags   = bp->b_cflags;
		nbp->vb_buf.b_oflags   = bp->b_oflags;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_lblkno   = 0;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_iodone   = sw_reg_biodone;
		nbp->vb_buf.b_vp       = vp;
		nbp->vb_buf.b_objlock  = &vp->v_interlock;
		if (vp->v_type == VBLK) {
			nbp->vb_buf.b_dev = vp->v_rdev;
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			buf_destroy(&nbp->vb_buf);
			pool_put(&vndbuf_pool, nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* sort it in and start I/O if we are not over our limit */
		/* XXXAD locking */
		BUFQ_PUT(sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		error = vnx->vx_error;
		pool_put(&vndxfer_pool, vnx);
		bp->b_error = error;
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by b_rawblkno (above)
 */
static void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*bp;
	struct vnode	*vp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = BUFQ_GET(sdp->swd_tab);
		if (bp == NULL)
			break;
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start: bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		vp = bp->b_vp;
		KASSERT(bp->b_objlock == &vp->v_interlock);
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(&vp->v_interlock);
			vp->v_numoutput++;
			mutex_exit(&vp->v_interlock);
		}
		VOP_STRATEGY(vp, bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_biodone: one of our i/o's has completed
 */
static void
sw_reg_biodone(struct buf *bp)
{
	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(struct work *wk, void *dummy)
{
	struct vndbuf *vbp = (void *)wk;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int s, resid, error;
	KASSERT(&vbp->vb_buf.b_work == wk);
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, " vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, " cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error != 0) {
		/* pass error upward */
		error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
		UVMHIST_LOG(pdhist, " got error=%d !", error, 0, 0, 0);
		vnx->vx_error = error;
	}

	/*
	 * kill vbp structure
	 */
	buf_destroy(&vbp->vb_buf);
	pool_put(&vndbuf_pool, vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			pbp->b_error = error;
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, " iodone error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list; as we
 *	allocate within a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm_swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?   definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	mutex_enter(&uvm_swap_data_lock);

ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			uint64_t result;

			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			result = blist_alloc(sdp->swd_blist, *nslots);
			if (result == BLIST_NONE) {
				continue;
			}
			KASSERT(result < sdp->swd_drumsize);

			/*
			 * successful allocation!   now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			mutex_exit(&uvm_swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success! returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return (result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		/* XXXMRG: ugh!  blist should support this for us */
		goto ReTry;
	}
	/* XXXMRG: END HACK */

	mutex_exit(&uvm_swap_data_lock);
	return 0;
}
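
/*
 * uvm_swap_alloc() above and the put/get/free functions below form the
 * slot-level interface used by the pagers.  a rough sketch of the usual
 * life cycle (illustrative only; see the uvm callers for the real thing):
 *
 *	int npages = 1, slot;
 *	slot = uvm_swap_alloc(&npages, false);	-- 0 means no space
 *	error = uvm_swap_put(slot, &pg, 1, PGO_SYNCIO);
 *	...
 *	error = uvm_swap_get(pg, slot, PGO_SYNCIO);	-- later, on fault
 *	uvm_swap_free(slot, 1);		-- once the slot is no longer needed
 *
 * slots that fail i/o are recorded with uvm_swap_markbad() rather than
 * being freed.
 */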

bool
uvm_swapisfull(void)
{
	bool rv;

	mutex_enter(&uvm_swap_data_lock);
	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
	rv = (uvmexp.swpgonly >= uvmexp.swpgavail);
	mutex_exit(&uvm_swap_data_lock);

	return (rv);
}

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(sdp != NULL);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */

	KASSERT(uvmexp.swpgonly >= nslots);
	uvmexp.swpgonly -= nslots;
	sdp->swd_npgbad += nslots;
	UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the blist, and return.   must hold uvm_swap_data_lock
	 * to do the lookup and access the blist.
	 */

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 */

int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int error;

	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	return error;
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 */

int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int error;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return EIO;
	}

	error = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	if (error == 0) {

		/*
		 * this page is no longer only in swap.
		 */

		mutex_enter(&uvm_swap_data_lock);
		KASSERT(uvmexp.swpgonly > 0);
		uvmexp.swpgonly--;
		mutex_exit(&uvm_swap_data_lock);
	}
	return error;
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct	buf *bp;
	vaddr_t kva;
	int	error, mapinflags;
	bool write, async;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/*
	 * allocate a buf for the i/o.
	 */

	KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
	bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
	if (bp == NULL) {
		uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
		return ENOMEM;
	}

	/*
	 * convert starting drum slot to block number
	 */

	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel.
	 */

	mapinflags = !write ?
		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
	kva = uvm_pagermapin(pps, npages, mapinflags);

	/*
	 * fill in the bp/sbp.   we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */

	bp->b_cflags = BC_BUSY | BC_NOCACHE;
	bp->b_flags = (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (void *)kva;
	bp->b_blkno = startblk;
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * bump v_numoutput (counter of number of active outputs).
	 */

	if (write) {
		mutex_enter(&swapdev_vp->v_interlock);
		swapdev_vp->v_numoutput++;
		mutex_exit(&swapdev_vp->v_interlock);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */

	if (async) {
		bp->b_iodone = uvm_aio_biodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
		if (curlwp == uvm.pagedaemon_lwp)
			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
		else
			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
	} else {
		bp->b_iodone = NULL;
		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */

	VOP_STRATEGY(swapdev_vp, bp);
	if (async)
		return 0;

	/*
	 * must be sync i/o.   wait for it to finish
	 */

	error = biowait(bp);

	/*
	 * kill the pager mapping
	 */

	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the buf and we're done.
	 */

	if (write) {
		mutex_enter(&swapdev_vp->v_interlock);
		vwakeup(bp);
		mutex_exit(&swapdev_vp->v_interlock);
	}
	putiobuf(bp);
	UVMHIST_LOG(pdhist, "<- done (sync) error=%d", error, 0, 0, 0);

	return (error);
}