/*	$NetBSD: uvm_swap.c,v 1.37 2000/05/19 03:45:04 thorpej Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include "fs_nfs.h"
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>

#include <vm/vm.h>
#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * the system maintains a fixed pool of "swapbuf" structures for use
 * at swap i/o time.  a swapbuf includes a "buf" structure and an
 * "aiodone" [we want to avoid malloc()'ing anything at swapout time
 * since memory may be low].
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap extent.
 *  - swap_buf_lock (simple_lock): this lock protects the free swapbuf
 *    pool.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl function performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

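/*
 * as an illustration of the above (added commentary, not part of the
 * original code): the global structures link up roughly like this,
 * with one "swappri" node per distinct priority and that priority's
 * swapdevs hanging off of it:
 *
 *	swap_priority -> swappri(0) -> swapdev -> swapdev -> ...
 *	                 swappri(5) -> swapdev -> ...
 *
 * and a userland program might enable swapping on a partition at
 * priority 0 with something like (the device name is hypothetical):
 *
 *	if (swapctl(SWAP_ON, "/dev/sd0b", 0) == -1)
 *		err(1, "swapctl");
 */
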
/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct oswapent swd_ose;
#define	swd_dev		swd_ose.ose_dev		/* device id */
#define	swd_flags	swd_ose.ose_flags	/* flags:inuse/enable/fake */
#define	swd_priority	swd_ose.ose_priority	/* our priority */
	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct buf_queue	swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
	struct ucred		*swd_cred;	/* cred for file access */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * swapbuf: swap buffer plus async i/o info
 */
struct swapbuf {
	struct buf sw_buf;		/* a buffer structure */
	struct uvm_aiodesc sw_aio;	/* aiodesc structure, used if ASYNC */
	SIMPLEQ_ENTRY(swapbuf) sw_sq;	/* free list pointer */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};


/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
struct pool *vndxfer_pool;
struct pool *vndbuf_pool;

#define	getvndxfer(vnx)	do {					\
	int s = splbio();					\
	vnx = pool_get(vndxfer_pool, PR_MALLOCOK|PR_WAITOK);	\
	splx(s);						\
} while (0)

#define putvndxfer(vnx) {					\
	pool_put(vndxfer_pool, (void *)(vnx));			\
}

#define	getvndbuf(vbp)	do {					\
	int s = splbio();					\
	vbp = pool_get(vndbuf_pool, PR_MALLOCOK|PR_WAITOK);	\
	splx(s);						\
} while (0)

#define putvndbuf(vbp) {					\
	pool_put(vndbuf_pool, (void *)(vbp));			\
}

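/*
 * note (added commentary, not in the original): the splbio() around
 * pool_get() in the macros above is there because these pools are
 * also emptied from interrupt context -- sw_reg_iodone() runs at
 * splbio from biodone() and calls putvndbuf/putvndxfer -- so any
 * top-half access must block out disk interrupts first.
 */
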
/* /dev/drum */
bdev_decl(sw);
cdev_decl(sw);

/*
 * local variables
 */
static struct extent *swapmap;	/* controls the mapping of /dev/drum */
SIMPLEQ_HEAD(swapbufhead, swapbuf);
struct pool *swapbuf_pool;

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
lock_data_t swap_syscall_lock;

/*
 * prototypes
 */
static void		 swapdrum_add __P((struct swapdev *, int));
static struct swapdev	*swapdrum_getsdp __P((int));

static struct swapdev	*swaplist_find __P((struct vnode *, int));
static void		 swaplist_insert __P((struct swapdev *,
					      struct swappri *, int));
static void		 swaplist_trim __P((void));

static int swap_on __P((struct proc *, struct swapdev *));
static int swap_off __P((struct proc *, struct swapdev *));

static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void sw_reg_iodone __P((struct buf *));
static void sw_reg_start __P((struct swapdev *));

static void uvm_swap_aiodone __P((struct uvm_aiodesc *));
static void uvm_swap_bufdone __P((struct buf *));
static int uvm_swap_io __P((struct vm_page **, int, int, int));

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init()
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
	simple_lock_init(&uvm.swap_data_lock);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.  the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
				M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * allocate our private pool of "swapbuf" structures (includes
	 * a "buf" structure).  ["nswbuf" comes from param.c and can
	 * be adjusted by MD code before we get here].
	 */

	swapbuf_pool =
		pool_create(sizeof(struct swapbuf), 0, 0, 0, "swp buf", 0,
			    NULL, NULL, 0);
	if (swapbuf_pool == NULL)
		panic("swapinit: pool_create failed");
	/* XXX - set a maximum on swapbuf_pool? */

	vndxfer_pool =
		pool_create(sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 0,
			    NULL, NULL, 0);
	if (vndxfer_pool == NULL)
		panic("swapinit: pool_create failed");

	vndbuf_pool =
		pool_create(sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 0,
			    NULL, NULL, 0);
	if (vndbuf_pool == NULL)
		panic("swapinit: pool_create failed");
	/*
	 * done!
	 */
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
static void
swaplist_insert(sdp, newspp, priority)
	struct swapdev *sdp;
	struct swappri *newspp;
	int priority;
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
			    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(vp, remove)
	struct vnode *vp;
	boolean_t remove;
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
						       sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 */
static void
swaplist_trim()
{
	struct swappri *spp, *nextspp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
		nextspp = LIST_NEXT(spp, spi_swappri);
		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
		    (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP);
	}
}

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => uvm.swap_data_lock should be unlocked (we may sleep)
 */
static void
swapdrum_add(sdp, npages)
	struct swapdev *sdp;
	int	npages;
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm.swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(pgno)
	int pgno;
{
	struct swapdev *sdp;
	struct swappri *spp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri))
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
	return NULL;
}

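/*
 * illustration (added, with made-up numbers): if swapdev A was added
 * first and got drumoffset 1 with drumsize 1024, it owns drum pages
 * 1-1024; if swapdev B was added next with drumsize 2048, it gets
 * drumoffset 1025 and owns drum pages 1025-3072.  a call to
 * swapdrum_getsdp(2000) would then return B, and slot 2000 is page
 * (2000 - 1025) = 975 within B's own extent.
 */
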

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[PATH_MAX + 1];
	size_t	len;
	int	count, error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, NULL);

	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm.swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_OSTATS
#endif
	    ) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		for (spp = LIST_FIRST(&swap_priority); spp != NULL;
		     spp = LIST_NEXT(spp, spi_swappri)) {
			for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
			     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
				/*
				 * backwards compatibility for system call.
				 * note that we use 'struct oswapent' as an
				 * overlay into both 'struct swapdev' and
				 * the userland 'struct swapent', as we
				 * want to retain backwards compatibility
				 * with NetBSD 1.3.
				 */
				sdp->swd_ose.ose_inuse =
				    btodb(sdp->swd_npginuse << PAGE_SHIFT);
				error = copyout(&sdp->swd_ose, sep,
				    sizeof(struct oswapent));

				/* now copy out the path if necessary */
#if defined(COMPAT_13)
				if (error == 0 && SCARG(uap, cmd) == SWAP_STATS)
#else
				if (error == 0)
#endif
					error = copyout(sdp->swd_path,
					    &sep->se_path, sdp->swd_pathlen);

				if (error)
					goto out;
				count++;
#if defined(COMPAT_13)
				if (SCARG(uap, cmd) == SWAP_OSTATS)
					((struct oswapent *)sep)++;
				else
#endif
					sep++;
			}
		}

		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);

		*retval = count;
		error = 0;
		goto out;
	}

	/*
	 * all other requests require superuser privs.  verify.
	 */
	if ((error = suser(p->p_ucred, &p->p_acflag)))
		goto out;

	/*
	 * at this point we expect a path name in arg.  we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, sizeof userpath, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    sizeof userpath, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {
	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			goto out;
		}
		dumpdev = vp->v_rdev;

		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		simple_unlock(&uvm.swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.  if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			simple_unlock(&uvm.swap_data_lock);
			break;
		}
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		memset(sdp, 0, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		BUFQ_INIT(&sdp->swd_tab);

		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG) {
			sdp->swd_cred = crdup(p->p_ucred);
		}

		swaplist_insert(sdp, spp, priority);
		simple_unlock(&uvm.swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(p, sdp)) != 0) {
			simple_lock(&uvm.swap_data_lock);
			(void) swaplist_find(vp, 1);	/* kill fake entry */
			swaplist_trim();
			simple_unlock(&uvm.swap_data_lock);
			if (vp->v_type == VREG) {
				crfree(sdp->swd_cred);
			}
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}

		/*
		 * got it!  now add a second reference to vp so that
		 * we keep a reference to the vnode after we return.
		 */
		vref(vp);
		break;

	case SWAP_OFF:
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			simple_unlock(&uvm.swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			simple_unlock(&uvm.swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		if ((error = swap_off(p, sdp)) != 0)
			goto out;

		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  use vput to drop our reference and unlock
	 */
	vput(vp);
out:
	lockmgr(&swap_syscall_lock, LK_RELEASE, NULL);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.  note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm.swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	struct vattr va;
#ifdef NFS
	extern int (**nfsv2_vnodeop_p) __P((void *));
#endif /* NFS */
	dev_t dev;
	char *name;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.  the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);

	/*
	 * we now need to determine the size of the swap area.  for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.  take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_ose.ose_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

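	/*
	 * worked example (added, hypothetical numbers): a 16384-block
	 * (512-byte blocks, so 8 MB) block-device partition with 4 KB
	 * pages gives npages = dbtob(16384) >> PAGE_SHIFT = 2048.
	 * since it is a VBLK we skip page 0 for the disklabel, so
	 * addr = 1 and size = 2047 usable pages (pages 1-2047).
	 */
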
	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.  we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	name = malloc(12, M_VMSWAP, M_WAITOK);
	sprintf(name, "swap0x%04x", count++);

	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(name, 0, npages - 1, M_VMSWAP,
				    0, 0, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel region");
	}

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.  do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (extent_alloc_region(sdp->swd_ex, addr,
					rootpages, EX_WAITOK))
			panic("swap_on: unable to preserve miniroot");

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}
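
	/*
	 * illustration of the miniroot math above (added, made-up
	 * numbers): with f_bsize = 8192 and f_blocks = 1000, the
	 * miniroot occupies 1000 * btodb(8192) = 16000 disk blocks,
	 * i.e. round_page(dbtob(16000)) >> PAGE_SHIFT = 2000 pages
	 * (4 KB pages), so pages addr .. addr+1999 are reserved and
	 * "size" shrinks by 2000.
	 */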

	/*
	 * add anons to reflect the new swap space
	 */
	uvm_anon_add(size);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	simple_lock(&uvm.swap_data_lock);
	swapdrum_add(sdp, npages);
	sdp->swd_npages = size;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	simple_unlock(&uvm.swap_data_lock);
	return (0);

bad:
	/*
	 * failure: close device if necessary and return error.
	 */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	void *name;
	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev,0,0,0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	simple_unlock(&uvm.swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    anon_swap_off(sdp->swd_drumoffset,
			  sdp->swd_drumoffset + sdp->swd_drumsize)) {

		simple_lock(&uvm.swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		simple_unlock(&uvm.swap_data_lock);
		return ENOMEM;
	}

#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse != sdp->swd_npgbad) {
		panic("swap_off: sdp %p - %d pages still in use (%d bad)\n",
		      sdp, sdp->swd_npginuse, sdp->swd_npgbad);
	}
#endif

	/*
	 * done with the vnode.
	 */
	if (sdp->swd_vp->v_type == VREG) {
		crfree(sdp->swd_cred);
	}
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	}
	if (sdp->swd_vp) {
		vrele(sdp->swd_vp);
	}

	/* remove anons from the system */
	uvm_anon_remove(sdp->swd_npages);

	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpages -= sdp->swd_npages;

	if (swaplist_find(sdp->swd_vp, 1) == NULL)
		panic("swap_off: swapdev not in list\n");
	swaplist_trim();

	/*
	 * free all resources!
	 */
	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
		    EX_WAITOK);
	name = (void *)sdp->swd_ex->ex_name;
	extent_destroy(sdp->swd_ex);
	free(name, M_VMSWAP);
	free(sdp, M_VMSWAP);
	simple_unlock(&uvm.swap_data_lock);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swread(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swwrite(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(bp)
	struct buf *bp;
{
	struct swapdev *sdp;
	struct vnode *vp;
	int s, pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.  note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob(bp->b_blkno) >> PAGE_SHIFT;
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	simple_unlock(&uvm.swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb(pageno << PAGE_SHIFT); /* convert to diskblock */

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld\n",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		bp->b_blkno = bn;		/* swapdev block number */
		vp = sdp->swd_vp;		/* swapdev vnode pointer */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
		VHOLD(vp);			/* "hold" swapdev vp for i/o */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			vp->v_numoutput++;	/* put it on swapdev */
		}

		/*
		 * disassociate buffer from /dev/drum vnode
		 * [could be null if buf was from physio]
		 */
		if (bp->b_vp != NULLVP)
			brelvp(bp);

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		splx(s);
		VOP_STRATEGY(bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(sdp, bp, bn)
	struct swapdev	*sdp;
	struct buf	*bp;
	int		bn;
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn, byteoff;
	caddr_t		addr;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	getvndxfer(vnx);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob(bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
				 &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 * XXXCDC: ignores read-ahead for non-zero offset
		 */
		if ((off = (byteoff % sdp->swd_bsize)) != 0)
			sz = sdp->swd_bsize - off;
		else
			sz = (1 + nra) * sdp->swd_bsize;

		if (resid < sz)
			sz = resid;

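		/*
		 * worked example for the "sz" math above (added,
		 * hypothetical numbers): with swd_bsize = 8192 and
		 * byteoff = 20480, VOP_BMAP is asked for logical block
		 * 20480 / 8192 = 2, off = 20480 % 8192 = 4096, and so
		 * sz = 8192 - 4096 = 4096 bytes this iteration; once
		 * byteoff is block-aligned (off == 0), a whole
		 * (1 + nra) * 8192 byte run can go out in one vndbuf.
		 */
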
		UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p offset 0x%x/0x%x",
			    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.  note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		getvndbuf(nbp);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_proc     = bp->b_proc;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = NULLVP;
		nbp->vb_buf.b_vnbufs.le_next = NOLIST;
		nbp->vb_buf.b_rcred    = sdp->swd_cred;
		nbp->vb_buf.b_wcred    = sdp->swd_cred;
		LIST_INIT(&nbp->vb_buf.b_dep);

		/*
		 * set b_dirtyoff/end and b_validoff/end.  this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* sort it in and start I/O if we are not over our limit */
		disksort_blkno(&sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by disksort (above)
 */
static void
sw_reg_start(sdp)
	struct swapdev	*sdp;
{
	struct buf	*bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = BUFQ_FIRST(&sdp->swd_tab);
		if (bp == NULL)
			break;
		BUFQ_REMOVE(&sdp->swd_tab, bp);
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;
		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int s, resid;
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error) {
		UVMHIST_LOG(pdhist, "  got error=%d !",
		    vbp->vb_buf.b_error, 0, 0, 0);

		/* pass error upward */
		vnx->vx_error = vbp->vb_buf.b_error;
	}

	/*
	 * disassociate this buffer from the vnode (if any).
	 */
	if (vbp->vb_buf.b_vp != NULLVP) {
		brelvp(&vbp->vb_buf);
	}

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
#ifdef DIAGNOSTIC
		if (vnx->vx_pending != 0)
			panic("sw_reg_iodone: vnx pending: %d", vnx->vx_pending);
#endif

		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone, pbp=%p error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!  start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm.swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(nslots, lessok)
	int *nslots;	/* IN/OUT */
	boolean_t lessok;
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long	result;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?  definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	simple_lock(&uvm.swap_data_lock);

ReTry:	/* XXXMRG */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp,swd_next)) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN,
					 EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
					 &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			simple_unlock(&uvm.swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return(result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	simple_unlock(&uvm.swap_data_lock);
	return 0;		/* failed */
}

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_markbad(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */
	sdp->swd_npgbad += nslots;

	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_free(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */
	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.  must hold pri lock to do
	 * lookup and access the extent.
	 */
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

#ifdef DIAGNOSTIC
	if (uvmexp.nswapdev < 1)
		panic("uvm_swap_free: uvmexp.nswapdev < 1\n");
	if (sdp == NULL) {
		printf("uvm_swap_free: startslot %d, nslots %d\n", startslot,
		    nslots);
		panic("uvm_swap_free: unmapped address\n");
	}
#endif
	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
			EX_MALLOCOK|EX_NOWAIT) != 0) {
		printf("warning: resource shortage: %d pages of swap lost\n",
			nslots);
	}

	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse < 0)
		panic("uvm_swap_free: inuse < 0");
#endif
	simple_unlock(&uvm.swap_data_lock);
}
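
/*
 * note on the drumoffset conversion above (added commentary): drum
 * slot numbers are global, so before touching a swapdev's private
 * extent they must be rebased.  continuing the made-up example from
 * swapdrum_getsdp: freeing startslot 2000 on a swapdev whose
 * drumoffset is 1025 frees extent offset 2000 - 1025 = 975.
 */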

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_put(swslot, ppsp, npages, flags)
	int swslot;
	struct vm_page **ppsp;
	int	npages;
	int	flags;
{
	int	result;

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_get(page, swslot, flags)
	struct vm_page *page;
	int swslot, flags;
{
	int	result;

	uvmexp.nswget++;
#ifdef DIAGNOSTIC
	if ((flags & PGO_SYNCIO) == 0)
		printf("uvm_swap_get: ASYNC get requested?\n");
#endif

	if (swslot == SWSLOT_BAD) {
		return VM_PAGER_ERROR;
	}

	/*
	 * this page is (about to be) no longer only in swap.
	 */
	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpgonly--;
	simple_unlock(&uvm.swap_data_lock);

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	if (result != VM_PAGER_OK && result != VM_PAGER_PEND) {
		/*
		 * oops, the read failed so it really is still only in swap.
		 */
		simple_lock(&uvm.swap_data_lock);
		uvmexp.swpgonly++;
		simple_unlock(&uvm.swap_data_lock);
	}

	return (result);
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(pps, startslot, npages, flags)
	struct vm_page **pps;
	int startslot, npages, flags;
{
	daddr_t startblk;
	struct	swapbuf *sbp;
	struct	buf *bp;
	vaddr_t kva;
	int	result, s, mapinflags, pflag;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);

	/*
	 * convert starting drum slot to block number
	 */
	startblk = btodb(startslot << PAGE_SHIFT);

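	/*
	 * e.g. (added, assuming 4 KB pages and 512-byte disk blocks):
	 * startslot 5 is byte offset 5 << 12 = 20480 in the drum, so
	 * startblk = btodb(20480) = 40.
	 */
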
	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).  note that we don't let pagermapin alloc
	 * an aiodesc structure because we don't want to chance a malloc.
	 * we've got our own pool of aiodesc structures (in swapbuf).
	 */
	mapinflags = (flags & B_READ) ? UVMPAGER_MAPIN_READ :
	    UVMPAGER_MAPIN_WRITE;
	if ((flags & B_ASYNC) == 0)
		mapinflags |= UVMPAGER_MAPIN_WAITOK;
	kva = uvm_pagermapin(pps, npages, NULL, mapinflags);
	if (kva == 0)
		return (VM_PAGER_AGAIN);

	/*
	 * now allocate a swap buffer off of freesbufs
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	pflag = ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc)
		? 0
		: PR_WAITOK;
	sbp = pool_get(swapbuf_pool, pflag);
	splx(s);		/* drop splbio */

	/*
	 * if we failed to get a swapbuf, return "try again"
	 */
	if (sbp == NULL)
		return (VM_PAGER_AGAIN);

	/*
	 * fill in the bp/sbp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp = &sbp->sw_buf;
	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_rcred = bp->b_wcred = proc0.p_ucred;
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = startblk;
	s = splbio();
	VHOLD(swapdev_vp);
	bp->b_vp = swapdev_vp;
	splx(s);
	/* XXXCDC: isn't swapdev_vp always a VCHR? */
	/* XXXMRG: probably -- this is obviously something inherited... */
	if (swapdev_vp->v_type == VBLK)
		bp->b_dev = swapdev_vp->v_rdev;
	bp->b_bcount = npages << PAGE_SHIFT;
	LIST_INIT(&bp->b_dep);

	/*
	 * for pageouts we must set "dirtyoff" [NFS client code needs it].
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if ((bp->b_flags & B_READ) == 0) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages << PAGE_SHIFT;
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/*
	 * for async ops we must set up the aiodesc and setup the callback
	 * XXX: we expect no async-reads, but we don't prevent it here.
	 */
	if (flags & B_ASYNC) {
		sbp->sw_aio.aiodone = uvm_swap_aiodone;
		sbp->sw_aio.kva = kva;
		sbp->sw_aio.npages = npages;
		sbp->sw_aio.pd_ptr = sbp;	/* backpointer */
		bp->b_flags |= B_CALL;		/* set callback */
		bp->b_iodone = uvm_swap_bufdone;/* "buf" iodone function */
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */
	VOP_STRATEGY(bp);
	if (flags & B_ASYNC)
		return (VM_PAGER_PEND);

	/*
	 * must be sync i/o.  wait for it to finish
	 */
	bp->b_error = biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

	/*
	 * kill the pager mapping
	 */
	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the swap buffer
	 */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);

	pool_put(swapbuf_pool, sbp);
	splx(s);

	/*
	 * finally return.
	 */
	UVMHIST_LOG(pdhist, "<- done (sync)  result=%d", result, 0, 0, 0);
	return (result);
}

/*
 * uvm_swap_bufdone: called from the buffer system when the i/o is done
 */
static void
uvm_swap_bufdone(bp)
	struct buf *bp;
{
	struct swapbuf *sbp = (struct swapbuf *) bp;
	int	s = splbio();
	UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "cleaning buf %p", bp, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check: swapbufs are private, so they shouldn't be wanted
	 */
	if (bp->b_flags & B_WANTED)
		panic("uvm_swap_bufdone: private buf wanted");
#endif

	/*
	 * drop the buffer's reference to the vnode.
	 */
	if (bp->b_vp)
		brelvp(bp);

	/*
	 * now put the aio on the uvm.aio_done list and wake the
	 * pagedaemon (which will finish up our job in its context).
	 */
	simple_lock(&uvm.pagedaemon_lock);	/* locks uvm.aio_done */
	TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq);
	simple_unlock(&uvm.pagedaemon_lock);

	wakeup(&uvm.pagedaemon);
	splx(s);
}

/*
 * uvm_swap_aiodone: aiodone function for anonymous memory
 *
 * => this is called in the context of the pagedaemon (but with the
 *	page queues unlocked!)
 * => our "aio" structure must be part of a "swapbuf"
 */
static void
uvm_swap_aiodone(aio)
	struct uvm_aiodesc *aio;
{
	struct swapbuf *sbp = aio->pd_ptr;
	struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT];
	int lcv, s;
	vaddr_t addr;
	UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check
	 */
	if (aio->npages > (MAXBSIZE >> PAGE_SHIFT))
		panic("uvm_swap_aiodone: aio too big!");
#endif

	/*
	 * first, we have to recover the page pointers (pps) by poking in the
	 * kernel pmap (XXX: should be saved in the buf structure).
	 */
	for (addr = aio->kva, lcv = 0 ; lcv < aio->npages ;
	     addr += PAGE_SIZE, lcv++) {
		pps[lcv] = uvm_pageratop(addr);
	}

	/*
	 * now we can dispose of the kernel mappings of the buffer
	 */
	uvm_pagermapout(aio->kva, aio->npages);

	/*
	 * now we can dispose of the pages by using the dropcluster function
	 * [note that we have no "page of interest" so we pass in null]
	 */
	uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages,
	    PGO_PDFREECLUST);

	/*
	 * finally, we can dispose of the swapbuf
	 */
	s = splbio();
	pool_put(swapbuf_pool, sbp);
	splx(s);
}