1 /* $NetBSD: uvm_swap.c,v 1.97 2005/12/11 12:25:29 christos Exp $ */ 2 3 /* 4 * Copyright (c) 1995, 1996, 1997 Matthew R. Green 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. The name of the author may not be used to endorse or promote products 16 * derived from this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 25 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp 31 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp 32 */ 33 34 #include <sys/cdefs.h> 35 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.97 2005/12/11 12:25:29 christos Exp $"); 36 37 #include "fs_nfs.h" 38 #include "opt_uvmhist.h" 39 #include "opt_compat_netbsd.h" 40 #include "opt_ddb.h" 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/buf.h> 45 #include <sys/bufq.h> 46 #include <sys/conf.h> 47 #include <sys/proc.h> 48 #include <sys/namei.h> 49 #include <sys/disklabel.h> 50 #include <sys/errno.h> 51 #include <sys/kernel.h> 52 #include <sys/malloc.h> 53 #include <sys/vnode.h> 54 #include <sys/file.h> 55 #include <sys/extent.h> 56 #include <sys/blist.h> 57 #include <sys/mount.h> 58 #include <sys/pool.h> 59 #include <sys/sa.h> 60 #include <sys/syscallargs.h> 61 #include <sys/swap.h> 62 63 #include <uvm/uvm.h> 64 65 #include <miscfs/specfs/specdev.h> 66 67 /* 68 * uvm_swap.c: manage configuration and i/o to swap space. 69 */ 70 71 /* 72 * swap space is managed in the following way: 73 * 74 * each swap partition or file is described by a "swapdev" structure. 75 * each "swapdev" structure contains a "swapent" structure which contains 76 * information that is passed up to the user (via system calls). 77 * 78 * each swap partition is assigned a "priority" (int) which controls 79 * swap parition usage. 80 * 81 * the system maintains a global data structure describing all swap 82 * partitions/files. there is a sorted LIST of "swappri" structures 83 * which describe "swapdev"'s at that priority. this LIST is headed 84 * by the "swap_priority" global var. each "swappri" contains a 85 * CIRCLEQ of "swapdev" structures at that priority. 86 * 87 * locking: 88 * - swap_syscall_lock (sleep lock): this lock serializes the swapctl 89 * system call and prevents the swap priority list from changing 90 * while we are in the middle of a system call (e.g. SWAP_STATS). 91 * - uvm.swap_data_lock (simple_lock): this lock protects all swap data 92 * structures including the priority list, the swapdev structures, 93 * and the swapmap extent. 94 * 95 * each swap device has the following info: 96 * - swap device in use (could be disabled, preventing future use) 97 * - swap enabled (allows new allocations on swap) 98 * - map info in /dev/drum 99 * - vnode pointer 100 * for swap files only: 101 * - block size 102 * - max byte count in buffer 103 * - buffer 104 * 105 * userland controls and configures swap with the swapctl(2) system call. 106 * the sys_swapctl performs the following operations: 107 * [1] SWAP_NSWAP: returns the number of swap devices currently configured 108 * [2] SWAP_STATS: given a pointer to an array of swapent structures 109 * (passed in via "arg") of a size passed in via "misc" ... we load 110 * the current swap config into the array. The actual work is done 111 * in the uvm_swap_stats(9) function. 112 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a 113 * priority in "misc", start swapping on it. 114 * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device 115 * [5] SWAP_CTL: changes the priority of a swap device (new priority in 116 * "misc") 117 */ 118 119 /* 120 * swapdev: describes a single swap partition/file 121 * 122 * note the following should be true: 123 * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks] 124 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel] 125 */ 126 struct swapdev { 127 struct oswapent swd_ose; 128 #define swd_dev swd_ose.ose_dev /* device id */ 129 #define swd_flags swd_ose.ose_flags /* flags:inuse/enable/fake */ 130 #define swd_priority swd_ose.ose_priority /* our priority */ 131 /* also: swd_ose.ose_nblks, swd_ose.ose_inuse */ 132 char *swd_path; /* saved pathname of device */ 133 int swd_pathlen; /* length of pathname */ 134 int swd_npages; /* #pages we can use */ 135 int swd_npginuse; /* #pages in use */ 136 int swd_npgbad; /* #pages bad */ 137 int swd_drumoffset; /* page0 offset in drum */ 138 int swd_drumsize; /* #pages in drum */ 139 blist_t swd_blist; /* blist for this swapdev */ 140 struct vnode *swd_vp; /* backing vnode */ 141 CIRCLEQ_ENTRY(swapdev) swd_next; /* priority circleq */ 142 143 int swd_bsize; /* blocksize (bytes) */ 144 int swd_maxactive; /* max active i/o reqs */ 145 struct bufq_state *swd_tab; /* buffer list */ 146 int swd_active; /* number of active buffers */ 147 }; 148 149 /* 150 * swap device priority entry; the list is kept sorted on `spi_priority'. 151 */ 152 struct swappri { 153 int spi_priority; /* priority */ 154 CIRCLEQ_HEAD(spi_swapdev, swapdev) spi_swapdev; 155 /* circleq of swapdevs at this priority */ 156 LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */ 157 }; 158 159 /* 160 * The following two structures are used to keep track of data transfers 161 * on swap devices associated with regular files. 162 * NOTE: this code is more or less a copy of vnd.c; we use the same 163 * structure names here to ease porting.. 164 */ 165 struct vndxfer { 166 struct buf *vx_bp; /* Pointer to parent buffer */ 167 struct swapdev *vx_sdp; 168 int vx_error; 169 int vx_pending; /* # of pending aux buffers */ 170 int vx_flags; 171 #define VX_BUSY 1 172 #define VX_DEAD 2 173 }; 174 175 struct vndbuf { 176 struct buf vb_buf; 177 struct vndxfer *vb_xfer; 178 }; 179 180 181 /* 182 * We keep a of pool vndbuf's and vndxfer structures. 183 */ 184 POOL_INIT(vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", NULL); 185 POOL_INIT(vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", NULL); 186 187 #define getvndxfer(vnx) do { \ 188 int sp = splbio(); \ 189 vnx = pool_get(&vndxfer_pool, PR_WAITOK); \ 190 splx(sp); \ 191 } while (/*CONSTCOND*/ 0) 192 193 #define putvndxfer(vnx) { \ 194 pool_put(&vndxfer_pool, (void *)(vnx)); \ 195 } 196 197 #define getvndbuf(vbp) do { \ 198 int sp = splbio(); \ 199 vbp = pool_get(&vndbuf_pool, PR_WAITOK); \ 200 splx(sp); \ 201 } while (/*CONSTCOND*/ 0) 202 203 #define putvndbuf(vbp) { \ 204 pool_put(&vndbuf_pool, (void *)(vbp)); \ 205 } 206 207 /* 208 * local variables 209 */ 210 static struct extent *swapmap; /* controls the mapping of /dev/drum */ 211 212 MALLOC_DEFINE(M_VMSWAP, "VM swap", "VM swap structures"); 213 214 /* list of all active swap devices [by priority] */ 215 LIST_HEAD(swap_priority, swappri); 216 static struct swap_priority swap_priority; 217 218 /* locks */ 219 static struct lock swap_syscall_lock; 220 221 /* 222 * prototypes 223 */ 224 static struct swapdev *swapdrum_getsdp(int); 225 226 static struct swapdev *swaplist_find(struct vnode *, int); 227 static void swaplist_insert(struct swapdev *, 228 struct swappri *, int); 229 static void swaplist_trim(void); 230 231 static int swap_on(struct lwp *, struct swapdev *); 232 static int swap_off(struct lwp *, struct swapdev *); 233 234 static void uvm_swap_stats_locked(int, struct swapent *, int, register_t *); 235 236 static void sw_reg_strategy(struct swapdev *, struct buf *, int); 237 static void sw_reg_iodone(struct buf *); 238 static void sw_reg_start(struct swapdev *); 239 240 static int uvm_swap_io(struct vm_page **, int, int, int); 241 242 /* 243 * uvm_swap_init: init the swap system data structures and locks 244 * 245 * => called at boot time from init_main.c after the filesystems 246 * are brought up (which happens after uvm_init()) 247 */ 248 void 249 uvm_swap_init(void) 250 { 251 UVMHIST_FUNC("uvm_swap_init"); 252 253 UVMHIST_CALLED(pdhist); 254 /* 255 * first, init the swap list, its counter, and its lock. 256 * then get a handle on the vnode for /dev/drum by using 257 * the its dev_t number ("swapdev", from MD conf.c). 258 */ 259 260 LIST_INIT(&swap_priority); 261 uvmexp.nswapdev = 0; 262 lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0); 263 simple_lock_init(&uvm.swap_data_lock); 264 265 if (bdevvp(swapdev, &swapdev_vp)) 266 panic("uvm_swap_init: can't get vnode for swap device"); 267 268 /* 269 * create swap block resource map to map /dev/drum. the range 270 * from 1 to INT_MAX allows 2 gigablocks of swap space. note 271 * that block 0 is reserved (used to indicate an allocation 272 * failure, or no allocation). 273 */ 274 swapmap = extent_create("swapmap", 1, INT_MAX, 275 M_VMSWAP, 0, 0, EX_NOWAIT); 276 if (swapmap == 0) 277 panic("uvm_swap_init: extent_create failed"); 278 279 /* 280 * done! 281 */ 282 UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0); 283 } 284 285 /* 286 * swaplist functions: functions that operate on the list of swap 287 * devices on the system. 288 */ 289 290 /* 291 * swaplist_insert: insert swap device "sdp" into the global list 292 * 293 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock 294 * => caller must provide a newly malloc'd swappri structure (we will 295 * FREE it if we don't need it... this it to prevent malloc blocking 296 * here while adding swap) 297 */ 298 static void 299 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority) 300 { 301 struct swappri *spp, *pspp; 302 UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist); 303 304 /* 305 * find entry at or after which to insert the new device. 306 */ 307 pspp = NULL; 308 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 309 if (priority <= spp->spi_priority) 310 break; 311 pspp = spp; 312 } 313 314 /* 315 * new priority? 316 */ 317 if (spp == NULL || spp->spi_priority != priority) { 318 spp = newspp; /* use newspp! */ 319 UVMHIST_LOG(pdhist, "created new swappri = %d", 320 priority, 0, 0, 0); 321 322 spp->spi_priority = priority; 323 CIRCLEQ_INIT(&spp->spi_swapdev); 324 325 if (pspp) 326 LIST_INSERT_AFTER(pspp, spp, spi_swappri); 327 else 328 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri); 329 } else { 330 /* we don't need a new priority structure, free it */ 331 FREE(newspp, M_VMSWAP); 332 } 333 334 /* 335 * priority found (or created). now insert on the priority's 336 * circleq list and bump the total number of swapdevs. 337 */ 338 sdp->swd_priority = priority; 339 CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 340 uvmexp.nswapdev++; 341 } 342 343 /* 344 * swaplist_find: find and optionally remove a swap device from the 345 * global list. 346 * 347 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock 348 * => we return the swapdev we found (and removed) 349 */ 350 static struct swapdev * 351 swaplist_find(struct vnode *vp, boolean_t remove) 352 { 353 struct swapdev *sdp; 354 struct swappri *spp; 355 356 /* 357 * search the lists for the requested vp 358 */ 359 360 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 361 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 362 if (sdp->swd_vp == vp) { 363 if (remove) { 364 CIRCLEQ_REMOVE(&spp->spi_swapdev, 365 sdp, swd_next); 366 uvmexp.nswapdev--; 367 } 368 return(sdp); 369 } 370 } 371 } 372 return (NULL); 373 } 374 375 376 /* 377 * swaplist_trim: scan priority list for empty priority entries and kill 378 * them. 379 * 380 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock 381 */ 382 static void 383 swaplist_trim(void) 384 { 385 struct swappri *spp, *nextspp; 386 387 for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) { 388 nextspp = LIST_NEXT(spp, spi_swappri); 389 if (CIRCLEQ_FIRST(&spp->spi_swapdev) != 390 (void *)&spp->spi_swapdev) 391 continue; 392 LIST_REMOVE(spp, spi_swappri); 393 free(spp, M_VMSWAP); 394 } 395 } 396 397 /* 398 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back 399 * to the "swapdev" that maps that section of the drum. 400 * 401 * => each swapdev takes one big contig chunk of the drum 402 * => caller must hold uvm.swap_data_lock 403 */ 404 static struct swapdev * 405 swapdrum_getsdp(int pgno) 406 { 407 struct swapdev *sdp; 408 struct swappri *spp; 409 410 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 411 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 412 if (sdp->swd_flags & SWF_FAKE) 413 continue; 414 if (pgno >= sdp->swd_drumoffset && 415 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { 416 return sdp; 417 } 418 } 419 } 420 return NULL; 421 } 422 423 424 /* 425 * sys_swapctl: main entry point for swapctl(2) system call 426 * [with two helper functions: swap_on and swap_off] 427 */ 428 int 429 sys_swapctl(struct lwp *l, void *v, register_t *retval) 430 { 431 struct sys_swapctl_args /* { 432 syscallarg(int) cmd; 433 syscallarg(void *) arg; 434 syscallarg(int) misc; 435 } */ *uap = (struct sys_swapctl_args *)v; 436 struct proc *p = l->l_proc; 437 struct vnode *vp; 438 struct nameidata nd; 439 struct swappri *spp; 440 struct swapdev *sdp; 441 struct swapent *sep; 442 char userpath[PATH_MAX + 1]; 443 size_t len; 444 int error, misc; 445 int priority; 446 UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist); 447 448 misc = SCARG(uap, misc); 449 450 /* 451 * ensure serialized syscall access by grabbing the swap_syscall_lock 452 */ 453 lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, NULL); 454 455 /* 456 * we handle the non-priv NSWAP and STATS request first. 457 * 458 * SWAP_NSWAP: return number of config'd swap devices 459 * [can also be obtained with uvmexp sysctl] 460 */ 461 if (SCARG(uap, cmd) == SWAP_NSWAP) { 462 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev, 463 0, 0, 0); 464 *retval = uvmexp.nswapdev; 465 error = 0; 466 goto out; 467 } 468 469 /* 470 * SWAP_STATS: get stats on current # of configured swap devs 471 * 472 * note that the swap_priority list can't change as long 473 * as we are holding the swap_syscall_lock. we don't want 474 * to grab the uvm.swap_data_lock because we may fault&sleep during 475 * copyout() and we don't want to be holding that lock then! 476 */ 477 if (SCARG(uap, cmd) == SWAP_STATS 478 #if defined(COMPAT_13) 479 || SCARG(uap, cmd) == SWAP_OSTATS 480 #endif 481 ) { 482 if ((size_t)misc > (size_t)uvmexp.nswapdev) 483 misc = uvmexp.nswapdev; 484 #if defined(COMPAT_13) 485 if (SCARG(uap, cmd) == SWAP_OSTATS) 486 len = sizeof(struct oswapent) * misc; 487 else 488 #endif 489 len = sizeof(struct swapent) * misc; 490 sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK); 491 492 uvm_swap_stats_locked(SCARG(uap, cmd), sep, misc, retval); 493 error = copyout(sep, SCARG(uap, arg), len); 494 495 free(sep, M_TEMP); 496 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0); 497 goto out; 498 } 499 if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) { 500 dev_t *devp = (dev_t *)SCARG(uap, arg); 501 502 error = copyout(&dumpdev, devp, sizeof(dumpdev)); 503 goto out; 504 } 505 506 /* 507 * all other requests require superuser privs. verify. 508 */ 509 if ((error = suser(p->p_ucred, &p->p_acflag))) 510 goto out; 511 512 /* 513 * at this point we expect a path name in arg. we will 514 * use namei() to gain a vnode reference (vref), and lock 515 * the vnode (VOP_LOCK). 516 * 517 * XXX: a NULL arg means use the root vnode pointer (e.g. for 518 * miniroot) 519 */ 520 if (SCARG(uap, arg) == NULL) { 521 vp = rootvp; /* miniroot */ 522 if (vget(vp, LK_EXCLUSIVE)) { 523 error = EBUSY; 524 goto out; 525 } 526 if (SCARG(uap, cmd) == SWAP_ON && 527 copystr("miniroot", userpath, sizeof userpath, &len)) 528 panic("swapctl: miniroot copy failed"); 529 } else { 530 int space; 531 char *where; 532 533 if (SCARG(uap, cmd) == SWAP_ON) { 534 if ((error = copyinstr(SCARG(uap, arg), userpath, 535 sizeof userpath, &len))) 536 goto out; 537 space = UIO_SYSSPACE; 538 where = userpath; 539 } else { 540 space = UIO_USERSPACE; 541 where = (char *)SCARG(uap, arg); 542 } 543 NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, l); 544 if ((error = namei(&nd))) 545 goto out; 546 vp = nd.ni_vp; 547 } 548 /* note: "vp" is referenced and locked */ 549 550 error = 0; /* assume no error */ 551 switch(SCARG(uap, cmd)) { 552 553 case SWAP_DUMPDEV: 554 if (vp->v_type != VBLK) { 555 error = ENOTBLK; 556 break; 557 } 558 dumpdev = vp->v_rdev; 559 cpu_dumpconf(); 560 break; 561 562 case SWAP_CTL: 563 /* 564 * get new priority, remove old entry (if any) and then 565 * reinsert it in the correct place. finally, prune out 566 * any empty priority structures. 567 */ 568 priority = SCARG(uap, misc); 569 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK); 570 simple_lock(&uvm.swap_data_lock); 571 if ((sdp = swaplist_find(vp, 1)) == NULL) { 572 error = ENOENT; 573 } else { 574 swaplist_insert(sdp, spp, priority); 575 swaplist_trim(); 576 } 577 simple_unlock(&uvm.swap_data_lock); 578 if (error) 579 free(spp, M_VMSWAP); 580 break; 581 582 case SWAP_ON: 583 584 /* 585 * check for duplicates. if none found, then insert a 586 * dummy entry on the list to prevent someone else from 587 * trying to enable this device while we are working on 588 * it. 589 */ 590 591 priority = SCARG(uap, misc); 592 sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK); 593 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK); 594 memset(sdp, 0, sizeof(*sdp)); 595 sdp->swd_flags = SWF_FAKE; 596 sdp->swd_vp = vp; 597 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV; 598 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK); 599 simple_lock(&uvm.swap_data_lock); 600 if (swaplist_find(vp, 0) != NULL) { 601 error = EBUSY; 602 simple_unlock(&uvm.swap_data_lock); 603 bufq_free(sdp->swd_tab); 604 free(sdp, M_VMSWAP); 605 free(spp, M_VMSWAP); 606 break; 607 } 608 swaplist_insert(sdp, spp, priority); 609 simple_unlock(&uvm.swap_data_lock); 610 611 sdp->swd_pathlen = len; 612 sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK); 613 if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0) 614 panic("swapctl: copystr"); 615 616 /* 617 * we've now got a FAKE placeholder in the swap list. 618 * now attempt to enable swap on it. if we fail, undo 619 * what we've done and kill the fake entry we just inserted. 620 * if swap_on is a success, it will clear the SWF_FAKE flag 621 */ 622 623 if ((error = swap_on(l, sdp)) != 0) { 624 simple_lock(&uvm.swap_data_lock); 625 (void) swaplist_find(vp, 1); /* kill fake entry */ 626 swaplist_trim(); 627 simple_unlock(&uvm.swap_data_lock); 628 bufq_free(sdp->swd_tab); 629 free(sdp->swd_path, M_VMSWAP); 630 free(sdp, M_VMSWAP); 631 break; 632 } 633 break; 634 635 case SWAP_OFF: 636 simple_lock(&uvm.swap_data_lock); 637 if ((sdp = swaplist_find(vp, 0)) == NULL) { 638 simple_unlock(&uvm.swap_data_lock); 639 error = ENXIO; 640 break; 641 } 642 643 /* 644 * If a device isn't in use or enabled, we 645 * can't stop swapping from it (again). 646 */ 647 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { 648 simple_unlock(&uvm.swap_data_lock); 649 error = EBUSY; 650 break; 651 } 652 653 /* 654 * do the real work. 655 */ 656 error = swap_off(l, sdp); 657 break; 658 659 default: 660 error = EINVAL; 661 } 662 663 /* 664 * done! release the ref gained by namei() and unlock. 665 */ 666 vput(vp); 667 668 out: 669 lockmgr(&swap_syscall_lock, LK_RELEASE, NULL); 670 671 UVMHIST_LOG(pdhist, "<- done! error=%d", error, 0, 0, 0); 672 return (error); 673 } 674 675 /* 676 * swap_stats: implements swapctl(SWAP_STATS). The function is kept 677 * away from sys_swapctl() in order to allow COMPAT_* swapctl() 678 * emulation to use it directly without going through sys_swapctl(). 679 * The problem with using sys_swapctl() there is that it involves 680 * copying the swapent array to the stackgap, and this array's size 681 * is not known at build time. Hence it would not be possible to 682 * ensure it would fit in the stackgap in any case. 683 */ 684 void 685 uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval) 686 { 687 688 lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, NULL); 689 uvm_swap_stats_locked(cmd, sep, sec, retval); 690 lockmgr(&swap_syscall_lock, LK_RELEASE, NULL); 691 } 692 693 static void 694 uvm_swap_stats_locked(int cmd, struct swapent *sep, int sec, register_t *retval) 695 { 696 struct swappri *spp; 697 struct swapdev *sdp; 698 int count = 0; 699 700 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 701 for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev); 702 sdp != (void *)&spp->spi_swapdev && sec-- > 0; 703 sdp = CIRCLEQ_NEXT(sdp, swd_next)) { 704 /* 705 * backwards compatibility for system call. 706 * note that we use 'struct oswapent' as an 707 * overlay into both 'struct swapdev' and 708 * the userland 'struct swapent', as we 709 * want to retain backwards compatibility 710 * with NetBSD 1.3. 711 */ 712 sdp->swd_ose.ose_inuse = 713 btodb((u_int64_t)sdp->swd_npginuse << 714 PAGE_SHIFT); 715 (void)memcpy(sep, &sdp->swd_ose, 716 sizeof(struct oswapent)); 717 718 /* now copy out the path if necessary */ 719 #if defined(COMPAT_13) 720 if (cmd == SWAP_STATS) 721 #endif 722 (void)memcpy(&sep->se_path, sdp->swd_path, 723 sdp->swd_pathlen); 724 725 count++; 726 #if defined(COMPAT_13) 727 if (cmd == SWAP_OSTATS) 728 sep = (struct swapent *) 729 ((struct oswapent *)sep + 1); 730 else 731 #endif 732 sep++; 733 } 734 } 735 736 *retval = count; 737 return; 738 } 739 740 /* 741 * swap_on: attempt to enable a swapdev for swapping. note that the 742 * swapdev is already on the global list, but disabled (marked 743 * SWF_FAKE). 744 * 745 * => we avoid the start of the disk (to protect disk labels) 746 * => we also avoid the miniroot, if we are swapping to root. 747 * => caller should leave uvm.swap_data_lock unlocked, we may lock it 748 * if needed. 749 */ 750 static int 751 swap_on(struct lwp *l, struct swapdev *sdp) 752 { 753 struct vnode *vp; 754 struct proc *p = l->l_proc; 755 int error, npages, nblocks, size; 756 long addr; 757 u_long result; 758 struct vattr va; 759 #ifdef NFS 760 extern int (**nfsv2_vnodeop_p)(void *); 761 #endif /* NFS */ 762 const struct bdevsw *bdev; 763 dev_t dev; 764 UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist); 765 766 /* 767 * we want to enable swapping on sdp. the swd_vp contains 768 * the vnode we want (locked and ref'd), and the swd_dev 769 * contains the dev_t of the file, if it a block device. 770 */ 771 772 vp = sdp->swd_vp; 773 dev = sdp->swd_dev; 774 775 /* 776 * open the swap file (mostly useful for block device files to 777 * let device driver know what is up). 778 * 779 * we skip the open/close for root on swap because the root 780 * has already been opened when root was mounted (mountroot). 781 */ 782 if (vp != rootvp) { 783 if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, l))) 784 return (error); 785 } 786 787 /* XXX this only works for block devices */ 788 UVMHIST_LOG(pdhist, " dev=%d, major(dev)=%d", dev, major(dev), 0,0); 789 790 /* 791 * we now need to determine the size of the swap area. for 792 * block specials we can call the d_psize function. 793 * for normal files, we must stat [get attrs]. 794 * 795 * we put the result in nblks. 796 * for normal files, we also want the filesystem block size 797 * (which we get with statfs). 798 */ 799 switch (vp->v_type) { 800 case VBLK: 801 bdev = bdevsw_lookup(dev); 802 if (bdev == NULL || bdev->d_psize == NULL || 803 (nblocks = (*bdev->d_psize)(dev)) == -1) { 804 error = ENXIO; 805 goto bad; 806 } 807 break; 808 809 case VREG: 810 if ((error = VOP_GETATTR(vp, &va, p->p_ucred, l))) 811 goto bad; 812 nblocks = (int)btodb(va.va_size); 813 if ((error = 814 VFS_STATVFS(vp->v_mount, &vp->v_mount->mnt_stat, l)) != 0) 815 goto bad; 816 817 sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize; 818 /* 819 * limit the max # of outstanding I/O requests we issue 820 * at any one time. take it easy on NFS servers. 821 */ 822 #ifdef NFS 823 if (vp->v_op == nfsv2_vnodeop_p) 824 sdp->swd_maxactive = 2; /* XXX */ 825 else 826 #endif /* NFS */ 827 sdp->swd_maxactive = 8; /* XXX */ 828 break; 829 830 default: 831 error = ENXIO; 832 goto bad; 833 } 834 835 /* 836 * save nblocks in a safe place and convert to pages. 837 */ 838 839 sdp->swd_ose.ose_nblks = nblocks; 840 npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT; 841 842 /* 843 * for block special files, we want to make sure that leave 844 * the disklabel and bootblocks alone, so we arrange to skip 845 * over them (arbitrarily choosing to skip PAGE_SIZE bytes). 846 * note that because of this the "size" can be less than the 847 * actual number of blocks on the device. 848 */ 849 if (vp->v_type == VBLK) { 850 /* we use pages 1 to (size - 1) [inclusive] */ 851 size = npages - 1; 852 addr = 1; 853 } else { 854 /* we use pages 0 to (size - 1) [inclusive] */ 855 size = npages; 856 addr = 0; 857 } 858 859 /* 860 * make sure we have enough blocks for a reasonable sized swap 861 * area. we want at least one page. 862 */ 863 864 if (size < 1) { 865 UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0); 866 error = EINVAL; 867 goto bad; 868 } 869 870 UVMHIST_LOG(pdhist, " dev=%x: size=%d addr=%ld\n", dev, size, addr, 0); 871 872 /* 873 * now we need to allocate an extent to manage this swap device 874 */ 875 876 sdp->swd_blist = blist_create(npages); 877 /* mark all expect the `saved' region free. */ 878 blist_free(sdp->swd_blist, addr, size); 879 880 /* 881 * if the vnode we are swapping to is the root vnode 882 * (i.e. we are swapping to the miniroot) then we want 883 * to make sure we don't overwrite it. do a statfs to 884 * find its size and skip over it. 885 */ 886 if (vp == rootvp) { 887 struct mount *mp; 888 struct statvfs *sp; 889 int rootblocks, rootpages; 890 891 mp = rootvnode->v_mount; 892 sp = &mp->mnt_stat; 893 rootblocks = sp->f_blocks * btodb(sp->f_frsize); 894 /* 895 * XXX: sp->f_blocks isn't the total number of 896 * blocks in the filesystem, it's the number of 897 * data blocks. so, our rootblocks almost 898 * definitely underestimates the total size 899 * of the filesystem - how badly depends on the 900 * details of the filesystem type. there isn't 901 * an obvious way to deal with this cleanly 902 * and perfectly, so for now we just pad our 903 * rootblocks estimate with an extra 5 percent. 904 */ 905 rootblocks += (rootblocks >> 5) + 906 (rootblocks >> 6) + 907 (rootblocks >> 7); 908 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT; 909 if (rootpages > size) 910 panic("swap_on: miniroot larger than swap?"); 911 912 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) { 913 panic("swap_on: unable to preserve miniroot"); 914 } 915 916 size -= rootpages; 917 printf("Preserved %d pages of miniroot ", rootpages); 918 printf("leaving %d pages of swap\n", size); 919 } 920 921 /* 922 * add a ref to vp to reflect usage as a swap device. 923 */ 924 vref(vp); 925 926 /* 927 * now add the new swapdev to the drum and enable. 928 */ 929 if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY, 930 EX_WAITOK, &result)) 931 panic("swapdrum_add"); 932 933 sdp->swd_drumoffset = (int)result; 934 sdp->swd_drumsize = npages; 935 sdp->swd_npages = size; 936 simple_lock(&uvm.swap_data_lock); 937 sdp->swd_flags &= ~SWF_FAKE; /* going live */ 938 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); 939 uvmexp.swpages += size; 940 uvmexp.swpgavail += size; 941 simple_unlock(&uvm.swap_data_lock); 942 return (0); 943 944 /* 945 * failure: clean up and return error. 946 */ 947 948 bad: 949 if (sdp->swd_blist) { 950 blist_destroy(sdp->swd_blist); 951 } 952 if (vp != rootvp) { 953 (void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, l); 954 } 955 return (error); 956 } 957 958 /* 959 * swap_off: stop swapping on swapdev 960 * 961 * => swap data should be locked, we will unlock. 962 */ 963 static int 964 swap_off(struct lwp *l, struct swapdev *sdp) 965 { 966 struct proc *p = l->l_proc; 967 int npages = sdp->swd_npages; 968 int error = 0; 969 970 UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist); 971 UVMHIST_LOG(pdhist, " dev=%x, npages=%d", sdp->swd_dev,npages,0,0); 972 973 /* disable the swap area being removed */ 974 sdp->swd_flags &= ~SWF_ENABLE; 975 uvmexp.swpgavail -= npages; 976 simple_unlock(&uvm.swap_data_lock); 977 978 /* 979 * the idea is to find all the pages that are paged out to this 980 * device, and page them all in. in uvm, swap-backed pageable 981 * memory can take two forms: aobjs and anons. call the 982 * swapoff hook for each subsystem to bring in pages. 983 */ 984 985 if (uao_swap_off(sdp->swd_drumoffset, 986 sdp->swd_drumoffset + sdp->swd_drumsize) || 987 amap_swap_off(sdp->swd_drumoffset, 988 sdp->swd_drumoffset + sdp->swd_drumsize)) { 989 error = ENOMEM; 990 } else if (sdp->swd_npginuse > sdp->swd_npgbad) { 991 error = EBUSY; 992 } 993 994 if (error) { 995 simple_lock(&uvm.swap_data_lock); 996 sdp->swd_flags |= SWF_ENABLE; 997 uvmexp.swpgavail += npages; 998 simple_unlock(&uvm.swap_data_lock); 999 1000 return error; 1001 } 1002 1003 /* 1004 * done with the vnode. 1005 * drop our ref on the vnode before calling VOP_CLOSE() 1006 * so that spec_close() can tell if this is the last close. 1007 */ 1008 vrele(sdp->swd_vp); 1009 if (sdp->swd_vp != rootvp) { 1010 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, l); 1011 } 1012 1013 simple_lock(&uvm.swap_data_lock); 1014 uvmexp.swpages -= npages; 1015 uvmexp.swpginuse -= sdp->swd_npgbad; 1016 1017 if (swaplist_find(sdp->swd_vp, 1) == NULL) 1018 panic("swap_off: swapdev not in list"); 1019 swaplist_trim(); 1020 simple_unlock(&uvm.swap_data_lock); 1021 1022 /* 1023 * free all resources! 1024 */ 1025 extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize, 1026 EX_WAITOK); 1027 blist_destroy(sdp->swd_blist); 1028 bufq_free(sdp->swd_tab); 1029 free(sdp, M_VMSWAP); 1030 return (0); 1031 } 1032 1033 /* 1034 * /dev/drum interface and i/o functions 1035 */ 1036 1037 /* 1038 * swstrategy: perform I/O on the drum 1039 * 1040 * => we must map the i/o request from the drum to the correct swapdev. 1041 */ 1042 static void 1043 swstrategy(struct buf *bp) 1044 { 1045 struct swapdev *sdp; 1046 struct vnode *vp; 1047 int s, pageno, bn; 1048 UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist); 1049 1050 /* 1051 * convert block number to swapdev. note that swapdev can't 1052 * be yanked out from under us because we are holding resources 1053 * in it (i.e. the blocks we are doing I/O on). 1054 */ 1055 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; 1056 simple_lock(&uvm.swap_data_lock); 1057 sdp = swapdrum_getsdp(pageno); 1058 simple_unlock(&uvm.swap_data_lock); 1059 if (sdp == NULL) { 1060 bp->b_error = EINVAL; 1061 bp->b_flags |= B_ERROR; 1062 biodone(bp); 1063 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0); 1064 return; 1065 } 1066 1067 /* 1068 * convert drum page number to block number on this swapdev. 1069 */ 1070 1071 pageno -= sdp->swd_drumoffset; /* page # on swapdev */ 1072 bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ 1073 1074 UVMHIST_LOG(pdhist, " %s: mapoff=%x bn=%x bcount=%ld", 1075 ((bp->b_flags & B_READ) == 0) ? "write" : "read", 1076 sdp->swd_drumoffset, bn, bp->b_bcount); 1077 1078 /* 1079 * for block devices we finish up here. 1080 * for regular files we have to do more work which we delegate 1081 * to sw_reg_strategy(). 1082 */ 1083 1084 switch (sdp->swd_vp->v_type) { 1085 default: 1086 panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type); 1087 1088 case VBLK: 1089 1090 /* 1091 * must convert "bp" from an I/O on /dev/drum to an I/O 1092 * on the swapdev (sdp). 1093 */ 1094 s = splbio(); 1095 bp->b_blkno = bn; /* swapdev block number */ 1096 vp = sdp->swd_vp; /* swapdev vnode pointer */ 1097 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */ 1098 1099 /* 1100 * if we are doing a write, we have to redirect the i/o on 1101 * drum's v_numoutput counter to the swapdevs. 1102 */ 1103 if ((bp->b_flags & B_READ) == 0) { 1104 vwakeup(bp); /* kills one 'v_numoutput' on drum */ 1105 V_INCR_NUMOUTPUT(vp); /* put it on swapdev */ 1106 } 1107 1108 /* 1109 * finally plug in swapdev vnode and start I/O 1110 */ 1111 bp->b_vp = vp; 1112 splx(s); 1113 VOP_STRATEGY(vp, bp); 1114 return; 1115 1116 case VREG: 1117 /* 1118 * delegate to sw_reg_strategy function. 1119 */ 1120 sw_reg_strategy(sdp, bp, bn); 1121 return; 1122 } 1123 /* NOTREACHED */ 1124 } 1125 1126 /* 1127 * swread: the read function for the drum (just a call to physio) 1128 */ 1129 /*ARGSUSED*/ 1130 static int 1131 swread(dev_t dev, struct uio *uio, int ioflag) 1132 { 1133 UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist); 1134 1135 UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0); 1136 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio)); 1137 } 1138 1139 /* 1140 * swwrite: the write function for the drum (just a call to physio) 1141 */ 1142 /*ARGSUSED*/ 1143 static int 1144 swwrite(dev_t dev, struct uio *uio, int ioflag) 1145 { 1146 UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist); 1147 1148 UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0); 1149 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio)); 1150 } 1151 1152 const struct bdevsw swap_bdevsw = { 1153 noopen, noclose, swstrategy, noioctl, nodump, nosize, 1154 }; 1155 1156 const struct cdevsw swap_cdevsw = { 1157 nullopen, nullclose, swread, swwrite, noioctl, 1158 nostop, notty, nopoll, nommap, nokqfilter 1159 }; 1160 1161 /* 1162 * sw_reg_strategy: handle swap i/o to regular files 1163 */ 1164 static void 1165 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) 1166 { 1167 struct vnode *vp; 1168 struct vndxfer *vnx; 1169 daddr_t nbn; 1170 caddr_t addr; 1171 off_t byteoff; 1172 int s, off, nra, error, sz, resid; 1173 UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist); 1174 1175 /* 1176 * allocate a vndxfer head for this transfer and point it to 1177 * our buffer. 1178 */ 1179 getvndxfer(vnx); 1180 vnx->vx_flags = VX_BUSY; 1181 vnx->vx_error = 0; 1182 vnx->vx_pending = 0; 1183 vnx->vx_bp = bp; 1184 vnx->vx_sdp = sdp; 1185 1186 /* 1187 * setup for main loop where we read filesystem blocks into 1188 * our buffer. 1189 */ 1190 error = 0; 1191 bp->b_resid = bp->b_bcount; /* nothing transfered yet! */ 1192 addr = bp->b_data; /* current position in buffer */ 1193 byteoff = dbtob((u_int64_t)bn); 1194 1195 for (resid = bp->b_resid; resid; resid -= sz) { 1196 struct vndbuf *nbp; 1197 1198 /* 1199 * translate byteoffset into block number. return values: 1200 * vp = vnode of underlying device 1201 * nbn = new block number (on underlying vnode dev) 1202 * nra = num blocks we can read-ahead (excludes requested 1203 * block) 1204 */ 1205 nra = 0; 1206 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, 1207 &vp, &nbn, &nra); 1208 1209 if (error == 0 && nbn == (daddr_t)-1) { 1210 /* 1211 * this used to just set error, but that doesn't 1212 * do the right thing. Instead, it causes random 1213 * memory errors. The panic() should remain until 1214 * this condition doesn't destabilize the system. 1215 */ 1216 #if 1 1217 panic("sw_reg_strategy: swap to sparse file"); 1218 #else 1219 error = EIO; /* failure */ 1220 #endif 1221 } 1222 1223 /* 1224 * punt if there was an error or a hole in the file. 1225 * we must wait for any i/o ops we have already started 1226 * to finish before returning. 1227 * 1228 * XXX we could deal with holes here but it would be 1229 * a hassle (in the write case). 1230 */ 1231 if (error) { 1232 s = splbio(); 1233 vnx->vx_error = error; /* pass error up */ 1234 goto out; 1235 } 1236 1237 /* 1238 * compute the size ("sz") of this transfer (in bytes). 1239 */ 1240 off = byteoff % sdp->swd_bsize; 1241 sz = (1 + nra) * sdp->swd_bsize - off; 1242 if (sz > resid) 1243 sz = resid; 1244 1245 UVMHIST_LOG(pdhist, "sw_reg_strategy: " 1246 "vp %p/%p offset 0x%x/0x%x", 1247 sdp->swd_vp, vp, byteoff, nbn); 1248 1249 /* 1250 * now get a buf structure. note that the vb_buf is 1251 * at the front of the nbp structure so that you can 1252 * cast pointers between the two structure easily. 1253 */ 1254 getvndbuf(nbp); 1255 BUF_INIT(&nbp->vb_buf); 1256 nbp->vb_buf.b_flags = bp->b_flags | B_CALL; 1257 nbp->vb_buf.b_bcount = sz; 1258 nbp->vb_buf.b_bufsize = sz; 1259 nbp->vb_buf.b_error = 0; 1260 nbp->vb_buf.b_data = addr; 1261 nbp->vb_buf.b_lblkno = 0; 1262 nbp->vb_buf.b_blkno = nbn + btodb(off); 1263 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno; 1264 nbp->vb_buf.b_iodone = sw_reg_iodone; 1265 nbp->vb_buf.b_vp = vp; 1266 if (vp->v_type == VBLK) { 1267 nbp->vb_buf.b_dev = vp->v_rdev; 1268 } 1269 1270 nbp->vb_xfer = vnx; /* patch it back in to vnx */ 1271 1272 /* 1273 * Just sort by block number 1274 */ 1275 s = splbio(); 1276 if (vnx->vx_error != 0) { 1277 putvndbuf(nbp); 1278 goto out; 1279 } 1280 vnx->vx_pending++; 1281 1282 /* sort it in and start I/O if we are not over our limit */ 1283 BUFQ_PUT(sdp->swd_tab, &nbp->vb_buf); 1284 sw_reg_start(sdp); 1285 splx(s); 1286 1287 /* 1288 * advance to the next I/O 1289 */ 1290 byteoff += sz; 1291 addr += sz; 1292 } 1293 1294 s = splbio(); 1295 1296 out: /* Arrive here at splbio */ 1297 vnx->vx_flags &= ~VX_BUSY; 1298 if (vnx->vx_pending == 0) { 1299 if (vnx->vx_error != 0) { 1300 bp->b_error = vnx->vx_error; 1301 bp->b_flags |= B_ERROR; 1302 } 1303 putvndxfer(vnx); 1304 biodone(bp); 1305 } 1306 splx(s); 1307 } 1308 1309 /* 1310 * sw_reg_start: start an I/O request on the requested swapdev 1311 * 1312 * => reqs are sorted by b_rawblkno (above) 1313 */ 1314 static void 1315 sw_reg_start(struct swapdev *sdp) 1316 { 1317 struct buf *bp; 1318 UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist); 1319 1320 /* recursion control */ 1321 if ((sdp->swd_flags & SWF_BUSY) != 0) 1322 return; 1323 1324 sdp->swd_flags |= SWF_BUSY; 1325 1326 while (sdp->swd_active < sdp->swd_maxactive) { 1327 bp = BUFQ_GET(sdp->swd_tab); 1328 if (bp == NULL) 1329 break; 1330 sdp->swd_active++; 1331 1332 UVMHIST_LOG(pdhist, 1333 "sw_reg_start: bp %p vp %p blkno %p cnt %lx", 1334 bp, bp->b_vp, bp->b_blkno, bp->b_bcount); 1335 if ((bp->b_flags & B_READ) == 0) 1336 V_INCR_NUMOUTPUT(bp->b_vp); 1337 1338 VOP_STRATEGY(bp->b_vp, bp); 1339 } 1340 sdp->swd_flags &= ~SWF_BUSY; 1341 } 1342 1343 /* 1344 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup 1345 * 1346 * => note that we can recover the vndbuf struct by casting the buf ptr 1347 */ 1348 static void 1349 sw_reg_iodone(struct buf *bp) 1350 { 1351 struct vndbuf *vbp = (struct vndbuf *) bp; 1352 struct vndxfer *vnx = vbp->vb_xfer; 1353 struct buf *pbp = vnx->vx_bp; /* parent buffer */ 1354 struct swapdev *sdp = vnx->vx_sdp; 1355 int s, resid, error; 1356 UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist); 1357 1358 UVMHIST_LOG(pdhist, " vbp=%p vp=%p blkno=%x addr=%p", 1359 vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data); 1360 UVMHIST_LOG(pdhist, " cnt=%lx resid=%lx", 1361 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0); 1362 1363 /* 1364 * protect vbp at splbio and update. 1365 */ 1366 1367 s = splbio(); 1368 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; 1369 pbp->b_resid -= resid; 1370 vnx->vx_pending--; 1371 1372 if (vbp->vb_buf.b_flags & B_ERROR) { 1373 /* pass error upward */ 1374 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO; 1375 UVMHIST_LOG(pdhist, " got error=%d !", error, 0, 0, 0); 1376 vnx->vx_error = error; 1377 } 1378 1379 /* 1380 * kill vbp structure 1381 */ 1382 putvndbuf(vbp); 1383 1384 /* 1385 * wrap up this transaction if it has run to completion or, in 1386 * case of an error, when all auxiliary buffers have returned. 1387 */ 1388 if (vnx->vx_error != 0) { 1389 /* pass error upward */ 1390 pbp->b_flags |= B_ERROR; 1391 pbp->b_error = vnx->vx_error; 1392 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { 1393 putvndxfer(vnx); 1394 biodone(pbp); 1395 } 1396 } else if (pbp->b_resid == 0) { 1397 KASSERT(vnx->vx_pending == 0); 1398 if ((vnx->vx_flags & VX_BUSY) == 0) { 1399 UVMHIST_LOG(pdhist, " iodone error=%d !", 1400 pbp, vnx->vx_error, 0, 0); 1401 putvndxfer(vnx); 1402 biodone(pbp); 1403 } 1404 } 1405 1406 /* 1407 * done! start next swapdev I/O if one is pending 1408 */ 1409 sdp->swd_active--; 1410 sw_reg_start(sdp); 1411 splx(s); 1412 } 1413 1414 1415 /* 1416 * uvm_swap_alloc: allocate space on swap 1417 * 1418 * => allocation is done "round robin" down the priority list, as we 1419 * allocate in a priority we "rotate" the circle queue. 1420 * => space can be freed with uvm_swap_free 1421 * => we return the page slot number in /dev/drum (0 == invalid slot) 1422 * => we lock uvm.swap_data_lock 1423 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM 1424 */ 1425 int 1426 uvm_swap_alloc(int *nslots /* IN/OUT */, boolean_t lessok) 1427 { 1428 struct swapdev *sdp; 1429 struct swappri *spp; 1430 UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist); 1431 1432 /* 1433 * no swap devices configured yet? definite failure. 1434 */ 1435 if (uvmexp.nswapdev < 1) 1436 return 0; 1437 1438 /* 1439 * lock data lock, convert slots into blocks, and enter loop 1440 */ 1441 simple_lock(&uvm.swap_data_lock); 1442 1443 ReTry: /* XXXMRG */ 1444 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 1445 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1446 uint64_t result; 1447 1448 /* if it's not enabled, then we can't swap from it */ 1449 if ((sdp->swd_flags & SWF_ENABLE) == 0) 1450 continue; 1451 if (sdp->swd_npginuse + *nslots > sdp->swd_npages) 1452 continue; 1453 result = blist_alloc(sdp->swd_blist, *nslots); 1454 if (result == BLIST_NONE) { 1455 continue; 1456 } 1457 KASSERT(result < sdp->swd_drumsize); 1458 1459 /* 1460 * successful allocation! now rotate the circleq. 1461 */ 1462 CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1463 CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 1464 sdp->swd_npginuse += *nslots; 1465 uvmexp.swpginuse += *nslots; 1466 simple_unlock(&uvm.swap_data_lock); 1467 /* done! return drum slot number */ 1468 UVMHIST_LOG(pdhist, 1469 "success! returning %d slots starting at %d", 1470 *nslots, result + sdp->swd_drumoffset, 0, 0); 1471 return (result + sdp->swd_drumoffset); 1472 } 1473 } 1474 1475 /* XXXMRG: BEGIN HACK */ 1476 if (*nslots > 1 && lessok) { 1477 *nslots = 1; 1478 /* XXXMRG: ugh! blist should support this for us */ 1479 goto ReTry; 1480 } 1481 /* XXXMRG: END HACK */ 1482 1483 simple_unlock(&uvm.swap_data_lock); 1484 return 0; 1485 } 1486 1487 boolean_t 1488 uvm_swapisfull(void) 1489 { 1490 boolean_t rv; 1491 1492 simple_lock(&uvm.swap_data_lock); 1493 KASSERT(uvmexp.swpgonly <= uvmexp.swpages); 1494 rv = (uvmexp.swpgonly >= uvmexp.swpgavail); 1495 simple_unlock(&uvm.swap_data_lock); 1496 1497 return (rv); 1498 } 1499 1500 /* 1501 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors 1502 * 1503 * => we lock uvm.swap_data_lock 1504 */ 1505 void 1506 uvm_swap_markbad(int startslot, int nslots) 1507 { 1508 struct swapdev *sdp; 1509 UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist); 1510 1511 simple_lock(&uvm.swap_data_lock); 1512 sdp = swapdrum_getsdp(startslot); 1513 KASSERT(sdp != NULL); 1514 1515 /* 1516 * we just keep track of how many pages have been marked bad 1517 * in this device, to make everything add up in swap_off(). 1518 * we assume here that the range of slots will all be within 1519 * one swap device. 1520 */ 1521 1522 KASSERT(uvmexp.swpgonly >= nslots); 1523 uvmexp.swpgonly -= nslots; 1524 sdp->swd_npgbad += nslots; 1525 UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0); 1526 simple_unlock(&uvm.swap_data_lock); 1527 } 1528 1529 /* 1530 * uvm_swap_free: free swap slots 1531 * 1532 * => this can be all or part of an allocation made by uvm_swap_alloc 1533 * => we lock uvm.swap_data_lock 1534 */ 1535 void 1536 uvm_swap_free(int startslot, int nslots) 1537 { 1538 struct swapdev *sdp; 1539 UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist); 1540 1541 UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots, 1542 startslot, 0, 0); 1543 1544 /* 1545 * ignore attempts to free the "bad" slot. 1546 */ 1547 1548 if (startslot == SWSLOT_BAD) { 1549 return; 1550 } 1551 1552 /* 1553 * convert drum slot offset back to sdp, free the blocks 1554 * in the extent, and return. must hold pri lock to do 1555 * lookup and access the extent. 1556 */ 1557 1558 simple_lock(&uvm.swap_data_lock); 1559 sdp = swapdrum_getsdp(startslot); 1560 KASSERT(uvmexp.nswapdev >= 1); 1561 KASSERT(sdp != NULL); 1562 KASSERT(sdp->swd_npginuse >= nslots); 1563 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); 1564 sdp->swd_npginuse -= nslots; 1565 uvmexp.swpginuse -= nslots; 1566 simple_unlock(&uvm.swap_data_lock); 1567 } 1568 1569 /* 1570 * uvm_swap_put: put any number of pages into a contig place on swap 1571 * 1572 * => can be sync or async 1573 */ 1574 1575 int 1576 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) 1577 { 1578 int error; 1579 1580 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE | 1581 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1582 return error; 1583 } 1584 1585 /* 1586 * uvm_swap_get: get a single page from swap 1587 * 1588 * => usually a sync op (from fault) 1589 */ 1590 1591 int 1592 uvm_swap_get(struct vm_page *page, int swslot, int flags) 1593 { 1594 int error; 1595 1596 uvmexp.nswget++; 1597 KASSERT(flags & PGO_SYNCIO); 1598 if (swslot == SWSLOT_BAD) { 1599 return EIO; 1600 } 1601 1602 error = uvm_swap_io(&page, swslot, 1, B_READ | 1603 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1604 if (error == 0) { 1605 1606 /* 1607 * this page is no longer only in swap. 1608 */ 1609 1610 simple_lock(&uvm.swap_data_lock); 1611 KASSERT(uvmexp.swpgonly > 0); 1612 uvmexp.swpgonly--; 1613 simple_unlock(&uvm.swap_data_lock); 1614 } 1615 return error; 1616 } 1617 1618 /* 1619 * uvm_swap_io: do an i/o operation to swap 1620 */ 1621 1622 static int 1623 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) 1624 { 1625 daddr_t startblk; 1626 struct buf *bp; 1627 vaddr_t kva; 1628 int error, s, mapinflags; 1629 boolean_t write, async; 1630 UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist); 1631 1632 UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d", 1633 startslot, npages, flags, 0); 1634 1635 write = (flags & B_READ) == 0; 1636 async = (flags & B_ASYNC) != 0; 1637 1638 /* 1639 * convert starting drum slot to block number 1640 */ 1641 1642 startblk = btodb((u_int64_t)startslot << PAGE_SHIFT); 1643 1644 /* 1645 * first, map the pages into the kernel. 1646 */ 1647 1648 mapinflags = !write ? 1649 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ : 1650 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE; 1651 kva = uvm_pagermapin(pps, npages, mapinflags); 1652 1653 /* 1654 * now allocate a buf for the i/o. 1655 */ 1656 1657 s = splbio(); 1658 bp = pool_get(&bufpool, PR_WAITOK); 1659 splx(s); 1660 1661 /* 1662 * fill in the bp/sbp. we currently route our i/o through 1663 * /dev/drum's vnode [swapdev_vp]. 1664 */ 1665 1666 BUF_INIT(bp); 1667 bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC)); 1668 bp->b_proc = &proc0; /* XXX */ 1669 bp->b_vnbufs.le_next = NOLIST; 1670 bp->b_data = (caddr_t)kva; 1671 bp->b_blkno = startblk; 1672 bp->b_vp = swapdev_vp; 1673 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT; 1674 1675 /* 1676 * bump v_numoutput (counter of number of active outputs). 1677 */ 1678 1679 if (write) { 1680 s = splbio(); 1681 V_INCR_NUMOUTPUT(swapdev_vp); 1682 splx(s); 1683 } 1684 1685 /* 1686 * for async ops we must set up the iodone handler. 1687 */ 1688 1689 if (async) { 1690 bp->b_flags |= B_CALL; 1691 bp->b_iodone = uvm_aio_biodone; 1692 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0); 1693 if (curproc == uvm.pagedaemon_proc) 1694 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1695 else 1696 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 1697 } else { 1698 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1699 } 1700 UVMHIST_LOG(pdhist, 1701 "about to start io: data = %p blkno = 0x%x, bcount = %ld", 1702 bp->b_data, bp->b_blkno, bp->b_bcount, 0); 1703 1704 /* 1705 * now we start the I/O, and if async, return. 1706 */ 1707 1708 VOP_STRATEGY(swapdev_vp, bp); 1709 if (async) 1710 return 0; 1711 1712 /* 1713 * must be sync i/o. wait for it to finish 1714 */ 1715 1716 error = biowait(bp); 1717 1718 /* 1719 * kill the pager mapping 1720 */ 1721 1722 uvm_pagermapout(kva, npages); 1723 1724 /* 1725 * now dispose of the buf and we're done. 1726 */ 1727 1728 s = splbio(); 1729 if (write) 1730 vwakeup(bp); 1731 pool_put(&bufpool, bp); 1732 splx(s); 1733 UVMHIST_LOG(pdhist, "<- done (sync) error=%d", error, 0, 0, 0); 1734 return (error); 1735 } 1736