/*	$NetBSD: uvm_swap.c,v 1.52 2001/05/26 16:32:47 chs Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include "fs_nfs.h"
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap extent.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl function performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
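
/*
 * for illustration, a hypothetical userland caller might query the
 * current swap configuration as follows (a sketch only, not compiled
 * here; error checking omitted):
 *
 *	int n = swapctl(SWAP_NSWAP, NULL, 0);
 *	struct swapent *sep = malloc(n * sizeof(*sep));
 *	n = swapctl(SWAP_STATS, sep, n);	-- fills up to "n" entries
 */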

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct oswapent		swd_ose;
#define	swd_dev			swd_ose.ose_dev		/* device id */
#define	swd_flags		swd_ose.ose_flags	/* flags:inuse/enable/fake */
#define	swd_priority		swd_ose.ose_priority	/* our priority */
	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	char			swd_exname[12];	/* name of extent above */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct buf_queue	swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
	struct ucred		*swd_cred;	/* cred for file access */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};
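
/*
 * a sketch of the resulting two-level layout (device names are
 * hypothetical): lower spi_priority values are preferred, and devices
 * sharing a priority take turns via the circleq:
 *
 *	swap_priority --> [pri 0: sd0b, sd1b] --> [pri 10: /swapfile]
 *
 * every walker in this file uses the same nested loop:
 *
 *	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
 *	     spp = LIST_NEXT(spp, spi_swappri))
 *		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
 *		     sdp != (void *)&spp->spi_swapdev;
 *		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
 *			...
 */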

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};


/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
static struct pool vndxfer_pool;
static struct pool vndbuf_pool;

#define	getvndxfer(vnx)	do {						\
	int s = splbio();						\
	vnx = pool_get(&vndxfer_pool, PR_MALLOCOK|PR_WAITOK);		\
	splx(s);							\
} while (0)

#define putvndxfer(vnx) {						\
	pool_put(&vndxfer_pool, (void *)(vnx));				\
}

#define	getvndbuf(vbp)	do {						\
	int s = splbio();						\
	vbp = pool_get(&vndbuf_pool, PR_MALLOCOK|PR_WAITOK);		\
	splx(s);							\
} while (0)

#define putvndbuf(vbp) {						\
	pool_put(&vndbuf_pool, (void *)(vbp));				\
}

/* /dev/drum */
bdev_decl(sw);
cdev_decl(sw);

/*
 * local variables
 */
static struct extent *swapmap;	/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
struct lock swap_syscall_lock;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp __P((int));

static struct swapdev	*swaplist_find __P((struct vnode *, int));
static void		 swaplist_insert __P((struct swapdev *,
					      struct swappri *, int));
static void		 swaplist_trim __P((void));

static int swap_on __P((struct proc *, struct swapdev *));
static int swap_off __P((struct proc *, struct swapdev *));

static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void sw_reg_iodone __P((struct buf *));
static void sw_reg_start __P((struct swapdev *));

static int uvm_swap_io __P((struct vm_page **, int, int, int));

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init()
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
	simple_lock_init(&uvm.swap_data_lock);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.  the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
				M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * allocate pools for structures used for swapping to files.
	 */

	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0,
	    "swp vnx", 0, NULL, NULL, 0);

	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0,
	    "swp vnd", 0, NULL, NULL, 0);

	/*
	 * done!
	 */
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}
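
/*
 * to illustrate the drum layout (numbers are hypothetical): swap_on()
 * below carves one contiguous chunk out of this extent per device, so
 * after enabling a 16384-page device and then an 8192-page one the drum
 * might look like:
 *
 *	swapmap:  [1 .. 16384] -> device A	[16385 .. 24576] -> device B
 *
 * slot 0 is never handed out: a swap slot number of 0 always means
 * "no allocation".
 */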

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
static void
swaplist_insert(sdp, newspp, priority)
	struct swapdev *sdp;
	struct swappri *newspp;
	int priority;
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
			    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(vp, remove)
	struct vnode *vp;
	boolean_t remove;
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
						       sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 */
static void
swaplist_trim()
{
	struct swappri *spp, *nextspp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
		nextspp = LIST_NEXT(spp, spi_swappri);
		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
		    (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP);
	}
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm.swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(pgno)
	int pgno;
{
	struct swapdev *sdp;
	struct swappri *spp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri))
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	return NULL;
}
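
/*
 * a worked example of the mapping (hypothetical numbers, 4k pages,
 * DEV_BSIZE of 512): for a device with swd_drumoffset = 16385 and
 * swd_drumsize = 8192, drum page 16390 belongs to that device, and
 * swstrategy() below converts it to a device block number as:
 *
 *	pageno = 16390 - 16385 = 5
 *	bn = btodb(5 << PAGE_SHIFT) = 40
 */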

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[PATH_MAX + 1];
	size_t	len;
	int	count, error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, NULL);

	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm.swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_OSTATS
#endif
	    ) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		for (spp = LIST_FIRST(&swap_priority); spp != NULL;
		    spp = LIST_NEXT(spp, spi_swappri)) {
			for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
			     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
				/*
				 * backwards compatibility for system call.
				 * note that we use 'struct oswapent' as an
				 * overlay into both 'struct swapdev' and
				 * the userland 'struct swapent', as we
				 * want to retain backwards compatibility
				 * with NetBSD 1.3.
				 */
				sdp->swd_ose.ose_inuse =
				    btodb((u_int64_t)sdp->swd_npginuse <<
				    PAGE_SHIFT);
				error = copyout(&sdp->swd_ose, sep,
				    sizeof(struct oswapent));

				/* now copy out the path if necessary */
#if defined(COMPAT_13)
				if (error == 0 && SCARG(uap, cmd) == SWAP_STATS)
#else
				if (error == 0)
#endif
					error = copyout(sdp->swd_path,
					    &sep->se_path, sdp->swd_pathlen);

				if (error)
					goto out;
				count++;
#if defined(COMPAT_13)
				if (SCARG(uap, cmd) == SWAP_OSTATS)
					sep = (struct swapent *)
					    ((struct oswapent *)sep + 1);
				else
#endif
					sep++;
			}
		}

		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);

		*retval = count;
		error = 0;
		goto out;
	}
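
	/*
	 * an illustrative note on the overlay trick above: 'struct
	 * oswapent' is laid out as a prefix of both 'struct swapdev'
	 * (via swd_ose) and the userland 'struct swapent', so a single
	 * copyout of sizeof(struct oswapent) fills the common fields.
	 * for SWAP_OSTATS the records are packed at the old size, so
	 * the output pointer advances by sizeof(struct oswapent);
	 * SWAP_STATS advances by sizeof(struct swapent) and also fills
	 * in se_path.
	 */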

	/*
	 * all other requests require superuser privs.  verify.
	 */
	if ((error = suser(p->p_ucred, &p->p_acflag)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
		dev_t	*devp = (dev_t *)SCARG(uap, arg);

		error = copyout(&dumpdev, devp, sizeof(dumpdev));
		goto out;
	}

	/*
	 * at this point we expect a path name in arg.  we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, sizeof userpath, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    sizeof userpath, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		dumpdev = vp->v_rdev;
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		simple_unlock(&uvm.swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.  if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&uvm.swap_data_lock);
		if (swaplist_find(vp, 0) != NULL) {
			error = EBUSY;
			simple_unlock(&uvm.swap_data_lock);
			free(sdp, M_VMSWAP);
			free(spp, M_VMSWAP);
			break;
		}
		memset(sdp, 0, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		BUFQ_INIT(&sdp->swd_tab);

		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG) {
			sdp->swd_cred = crdup(p->p_ucred);
		}

		swaplist_insert(sdp, spp, priority);
		simple_unlock(&uvm.swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(p, sdp)) != 0) {
			simple_lock(&uvm.swap_data_lock);
			(void) swaplist_find(vp, 1);	/* kill fake entry */
			swaplist_trim();
			simple_unlock(&uvm.swap_data_lock);
			if (vp->v_type == VREG) {
				crfree(sdp->swd_cred);
			}
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}
		break;

	case SWAP_OFF:
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			simple_unlock(&uvm.swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			simple_unlock(&uvm.swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(p, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);

out:
	lockmgr(&swap_syscall_lock, LK_RELEASE, NULL);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm.swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	u_long result;
	struct vattr va;
#ifdef NFS
	extern int (**nfsv2_vnodeop_p) __P((void *));
#endif /* NFS */
	dev_t dev;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.  the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);

	/*
	 * we now need to determine the size of the swap area.  for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblocks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.  take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_ose.ose_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.  we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	snprintf(sdp->swd_exname, sizeof(sdp->swd_exname), "swap0x%04x",
	    count++);

	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(sdp->swd_exname, 0, npages - 1, M_VMSWAP,
				    0, 0, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel region");
	}

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.  do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (extent_alloc_region(sdp->swd_ex, addr,
					rootpages, EX_WAITOK))
			panic("swap_on: unable to preserve miniroot");

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * try to add anons to reflect the new swap space.
	 */

	error = uvm_anon_add(size);
	if (error) {
		goto bad;
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	simple_lock(&uvm.swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	simple_unlock(&uvm.swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	if (sdp->swd_ex) {
		extent_destroy(sdp->swd_ex);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	}
	return (error);
}
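
/*
 * a worked example of the sizing above (hypothetical device, 4k pages,
 * DEV_BSIZE of 512): a 64MB block special returns nblocks = 131072 from
 * d_psize, so:
 *
 *	npages = dbtob(131072) >> PAGE_SHIFT = 16384
 *	addr = 1, size = npages - 1 = 16383	-- page 0 shields the label
 *
 * the per-device extent then covers [0 .. 16383] with [0 .. 0]
 * pre-allocated.
 */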

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev,0,0,0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	simple_unlock(&uvm.swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    anon_swap_off(sdp->swd_drumoffset,
			  sdp->swd_drumoffset + sdp->swd_drumsize)) {

		simple_lock(&uvm.swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		simple_unlock(&uvm.swap_data_lock);
		return ENOMEM;
	}
	KASSERT(sdp->swd_npginuse == sdp->swd_npgbad);

	/*
	 * done with the vnode and saved creds.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	if (sdp->swd_vp->v_type == VREG) {
		crfree(sdp->swd_cred);
	}
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	}

	/* remove anons from the system */
	uvm_anon_remove(sdp->swd_npages);

	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpages -= sdp->swd_npages;

	if (swaplist_find(sdp->swd_vp, 1) == NULL)
		panic("swap_off: swapdev not in list\n");
	swaplist_trim();
	simple_unlock(&uvm.swap_data_lock);

	/*
	 * free all resources!
	 */
	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
		    EX_WAITOK);
	extent_destroy(sdp->swd_ex);
	free(sdp, M_VMSWAP);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swread(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swwrite(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(bp)
	struct buf *bp;
{
	struct swapdev *sdp;
	struct vnode *vp;
	int s, pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.  note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	simple_unlock(&uvm.swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		bp->b_blkno = bn;		/* swapdev block number */
		vp = sdp->swd_vp;		/* swapdev vnode pointer */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
		VHOLD(vp);			/* "hold" swapdev vp for i/o */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			vp->v_numoutput++;	/* put it on swapdev */
		}

		/*
		 * disassociate buffer from /dev/drum vnode
		 * [could be null if buf was from physio]
		 */
		if (bp->b_vp != NULL)
			brelvp(bp);

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		splx(s);
		VOP_STRATEGY(bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(sdp, bp, bn)
	struct swapdev	*sdp;
	struct buf	*bp;
	int		bn;
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	caddr_t		addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	getvndxfer(vnx);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((u_int64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
				 &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
			    "vp %p/%p offset 0x%x/0x%x",
			    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.  note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		getvndbuf(nbp);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_lblkno   = 0;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = NULL;
		LIST_INIT(&nbp->vb_buf.b_dep);

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* sort it in and start I/O if we are not over our limit */
		disksort_blkno(&sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}
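
/*
 * to illustrate the transfer sizing above with made-up numbers: with
 * swd_bsize = 8192, byteoff = 20480, and nra = 1 back from VOP_BMAP:
 *
 *	off = 20480 % 8192 = 4096
 *	sz  = (1 + 1) * 8192 - 4096 = 12288
 *
 * i.e. one child buffer covers the rest of the current filesystem block
 * plus one read-ahead block (clipped to "resid" if that is smaller).
 */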

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by disksort (above)
 */
static void
sw_reg_start(sdp)
	struct swapdev	*sdp;
{
	struct buf	*bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = BUFQ_FIRST(&sdp->swd_tab);
		if (bp == NULL)
			break;
		BUFQ_REMOVE(&sdp->swd_tab, bp);
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;

		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int s, resid;
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error) {
		UVMHIST_LOG(pdhist, "  got error=%d !",
		    vbp->vb_buf.b_error, 0, 0, 0);

		/* pass error upward */
		vnx->vx_error = vbp->vb_buf.b_error;
	}

	/*
	 * disassociate this buffer from the vnode.
	 */
	brelvp(&vbp->vb_buf);

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!  start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}

/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm.swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(nslots, lessok)
	int *nslots;	/* IN/OUT */
	boolean_t lessok;
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long	result;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?  definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	simple_lock(&uvm.swap_data_lock);

ReTry:	/* XXXMRG */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp,swd_next)) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN,
					 EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
					 &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			simple_unlock(&uvm.swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return(result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	simple_unlock(&uvm.swap_data_lock);
	return 0;		/* failed */
}
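
/*
 * rotation example (hypothetical): with two enabled devices A and B at
 * the same priority, successive allocations land on
 *
 *	A, B, A, B, ...
 *
 * because each successful allocation moves the chosen device to the tail
 * of its circleq.  a lower-numbered priority level is always exhausted
 * before the next level is tried.
 */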

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_markbad(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */

	sdp->swd_npgbad += nslots;
	UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_free(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.  must hold pri lock to do
	 * lookup and access the extent.
	 */

	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
			EX_MALLOCOK|EX_NOWAIT) != 0) {
		printf("warning: resource shortage: %d pages of swap lost\n",
			nslots);
	}
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_put(swslot, ppsp, npages, flags)
	int swslot;
	struct vm_page **ppsp;
	int	npages;
	int	flags;
{
	int	result;

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_get(page, swslot, flags)
	struct vm_page *page;
	int swslot, flags;
{
	int	result;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return EIO;
	}

	/*
	 * this page is (about to be) no longer only in swap.
	 */

	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpgonly--;
	simple_unlock(&uvm.swap_data_lock);

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	if (result != 0) {

		/*
		 * oops, the read failed so it really is still only in swap.
		 */

		simple_lock(&uvm.swap_data_lock);
		uvmexp.swpgonly++;
		simple_unlock(&uvm.swap_data_lock);
	}

	return (result);
}
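
/*
 * a sketch of how a caller might drive this interface (illustrative
 * only; the real callers live in the uvm pager code):
 *
 *	int nslots = npages;
 *	int slot = uvm_swap_alloc(&nslots, TRUE);	-- may shrink nslots
 *	if (slot != 0)
 *		error = uvm_swap_put(slot, pps, nslots, PGO_SYNCIO);
 *
 * a slot of 0 means the drum is full; uvm_swap_free(slot, nslots)
 * returns the space later.
 */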

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(pps, startslot, npages, flags)
	struct vm_page **pps;
	int startslot, npages, flags;
{
	daddr_t startblk;
	struct	buf *bp;
	vaddr_t kva;
	int	error, s, mapinflags, pflag;
	boolean_t write, async;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/*
	 * convert starting drum slot to block number
	 */
	startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).
	 */

	mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
	if (!async)
		mapinflags |= UVMPAGER_MAPIN_WAITOK;
	kva = uvm_pagermapin(pps, npages, mapinflags);
	if (kva == 0)
		return (EAGAIN);

	/*
	 * now allocate a buf for the i/o.
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	pflag = (async || curproc == uvm.pagedaemon_proc) ? 0 : PR_WAITOK;
	bp = pool_get(&bufpool, pflag);
	splx(s);

	/*
	 * if we failed to get a buf, return "try again"
	 */
	if (bp == NULL)
		return (EAGAIN);

	/*
	 * fill in the bp/sbp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = startblk;
	s = splbio();
	VHOLD(swapdev_vp);
	bp->b_vp = swapdev_vp;
	splx(s);
	/* XXXCDC: isn't swapdev_vp always a VCHR? */
	/* XXXMRG: probably -- this is obviously something inherited... */
	if (swapdev_vp->v_type == VBLK)
		bp->b_dev = swapdev_vp->v_rdev;
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
	LIST_INIT(&bp->b_dep);

	/*
	 * bump v_numoutput (counter of number of active outputs).
	 */
	if (write) {
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */
	if (async) {
		/* XXXUBC pagedaemon */
		bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
					 B_PDAEMON : 0);
		bp->b_iodone = uvm_aio_biodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */
	VOP_STRATEGY(bp);
	if (async)
		return 0;

	/*
	 * must be sync i/o.  wait for it to finish
	 */
	error = biowait(bp);

	/*
	 * kill the pager mapping
	 */
	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the buf
	 */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);
	if (write)
		vwakeup(bp);
	pool_put(&bufpool, bp);
	splx(s);

	/*
	 * finally return.
	 */
	UVMHIST_LOG(pdhist, "<- done (sync)  error=%d", error, 0, 0, 0);
	return (error);
}