/*	$NetBSD: uvm_swap.c,v 1.64 2002/05/09 21:43:44 fredette Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.64 2002/05/09 21:43:44 fredette Exp $");

#include "fs_nfs.h"
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.   there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.   this LIST is headed
 * by the "swap_priority" global var.    each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap extent.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl() function performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array. The actual work is done
 *	in the uvm_swap_stats(9) function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
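
/*
 * as a hedged illustration of the swapctl(2) interface described above
 * (userland code, not part of this file): a caller might size and fetch
 * the current swap configuration roughly like this, error handling
 * omitted:
 *
 *	int i, n = swapctl(SWAP_NSWAP, NULL, 0);
 *	struct swapent *sep = malloc(n * sizeof(*sep));
 *	n = swapctl(SWAP_STATS, (void *)sep, n);
 *	for (i = 0; i < n; i++)
 *		printf("%s pri=%d\n", sep[i].se_path, sep[i].se_priority);
 */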

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct oswapent swd_ose;
#define	swd_dev		swd_ose.ose_dev		/* device id */
#define	swd_flags	swd_ose.ose_flags	/* flags:inuse/enable/fake */
#define	swd_priority	swd_ose.ose_priority	/* our priority */
	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	char			swd_exname[12];	/* name of extent above */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct buf_queue	swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;     /* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
};
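
/*
 * the global structure described above therefore looks roughly like
 * this (a sketch, one CIRCLEQ per priority, LIST sorted by priority):
 *
 *	swap_priority --> swappri(0) --> swappri(5) --> ...
 *	                     |               |
 *	                  swapdev(s)      swapdev(s)
 *
 * the usual traversal pattern in this file is:
 *
 *	LIST_FOREACH(spp, &swap_priority, spi_swappri)
 *		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)
 *			...
 */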

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};


/*
 * We keep a pool of vndbuf and vndxfer structures.
 */
static struct pool vndxfer_pool;
static struct pool vndbuf_pool;

#define	getvndxfer(vnx)	do {						\
	int s = splbio();						\
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);			\
	splx(s);							\
} while (0)

#define putvndxfer(vnx) {						\
	pool_put(&vndxfer_pool, (void *)(vnx));				\
}

#define	getvndbuf(vbp)	do {						\
	int s = splbio();						\
	vbp = pool_get(&vndbuf_pool, PR_WAITOK);			\
	splx(s);							\
} while (0)

#define putvndbuf(vbp) {						\
	pool_put(&vndbuf_pool, (void *)(vbp));				\
}
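
/*
 * note: the splbio() bracket in the "get" macros above appears to be
 * there because these pools are also released into from sw_reg_iodone(),
 * which runs at biodone() time (interrupt context); the "put" macros
 * are only invoked from code already running at splbio.
 */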

/* /dev/drum */
bdev_decl(sw);
cdev_decl(sw);

/*
 * local variables
 */
static struct extent *swapmap;		/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
struct lock swap_syscall_lock;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp __P((int));

static struct swapdev	*swaplist_find __P((struct vnode *, int));
static void		 swaplist_insert __P((struct swapdev *,
					     struct swappri *, int));
static void		 swaplist_trim __P((void));

static int swap_on __P((struct proc *, struct swapdev *));
static int swap_off __P((struct proc *, struct swapdev *));

static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void sw_reg_iodone __P((struct buf *));
static void sw_reg_start __P((struct swapdev *));

static int uvm_swap_io __P((struct vm_page **, int, int, int));

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init()
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
	simple_lock_init(&uvm.swap_data_lock);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.   the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
				M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * allocate pools for structures used for swapping to files.
	 */

	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0,
	    "swp vnx", NULL);

	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0,
	    "swp vnd", NULL);

	/*
	 * done!
	 */
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
static void
swaplist_insert(sdp, newspp, priority)
	struct swapdev *sdp;
	struct swappri *newspp;
	int priority;
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;  /* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
			    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).   now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(vp, remove)
	struct vnode *vp;
	boolean_t remove;
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
		}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 */
static void
swaplist_trim()
{
	struct swappri *spp, *nextspp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
		nextspp = LIST_NEXT(spp, spi_swappri);
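		/* a circleq is empty when its head points back at itself */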
		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
		    (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP);
	}
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm.swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(pgno)
	int pgno;
{
	struct swapdev *sdp;
	struct swappri *spp;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}
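
/*
 * a worked example of the drum layout (illustrative numbers only;
 * actual offsets come from extent_alloc on "swapmap"): a 256-page swap
 * partition added first and a 128-page swap file added second might
 * occupy drum pages [1, 257) and [257, 385) respectively (drum page 0
 * is never allocated), so a request for drum page 300 resolves to the
 * second swapdev, whose swd_drumoffset is 257.
 */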


/*
 * sys_swapctl: main entry point for swapctl(2) system call
 * 	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[PATH_MAX + 1];
	size_t	len;
	int	error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, NULL);

	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm.swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_OSTATS
#endif
	    ) {
		misc = MIN(uvmexp.nswapdev, misc);
#if defined(COMPAT_13)
		if (SCARG(uap, cmd) == SWAP_OSTATS)
			len = sizeof(struct oswapent) * misc;
		else
#endif
			len = sizeof(struct swapent) * misc;
		sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK);

		uvm_swap_stats(SCARG(uap, cmd), sep, misc, retval);
		error = copyout(sep, (void *)SCARG(uap, arg), len);

		free(sep, M_TEMP);
		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
		goto out;
	}
	if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
		dev_t	*devp = (dev_t *)SCARG(uap, arg);

		error = copyout(&dumpdev, devp, sizeof(dumpdev));
		goto out;
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = suser(p->p_ucred, &p->p_acflag)))
		goto out;

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, sizeof userpath, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    sizeof userpath, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		dumpdev = vp->v_rdev;
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		simple_unlock(&uvm.swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&uvm.swap_data_lock);
		if (swaplist_find(vp, 0) != NULL) {
			error = EBUSY;
			simple_unlock(&uvm.swap_data_lock);
			free(sdp, M_VMSWAP);
			free(spp, M_VMSWAP);
			break;
		}
		memset(sdp, 0, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		BUFQ_INIT(&sdp->swd_tab);

		swaplist_insert(sdp, spp, priority);
		simple_unlock(&uvm.swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(p, sdp)) != 0) {
			simple_lock(&uvm.swap_data_lock);
			(void) swaplist_find(vp, 1);  /* kill fake entry */
			swaplist_trim();
			simple_unlock(&uvm.swap_data_lock);
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}
		break;

	case SWAP_OFF:
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			simple_unlock(&uvm.swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			simple_unlock(&uvm.swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(p, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);

out:
	lockmgr(&swap_syscall_lock, LK_RELEASE, NULL);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 * emulation to use it directly without going through sys_swapctl().
 * The problem with using sys_swapctl() there is that it involves
 * copying the swapent array to the stackgap, and this array's size
 * is not known at build time. Hence it would not be possible to
 * ensure it would fit in the stackgap in any case.
 */
void
uvm_swap_stats(cmd, sep, sec, retval)
	int cmd;
	struct swapent *sep;
	int sec;
	register_t *retval;
{
	struct swappri *spp;
	struct swapdev *sdp;
	int count = 0;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev && sec-- > 0;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
			/*
			 * backwards compatibility for system call.
			 * note that we use 'struct oswapent' as an
			 * overlay into both 'struct swapdev' and
			 * the userland 'struct swapent', as we
			 * want to retain backwards compatibility
			 * with NetBSD 1.3.
			 */
			sdp->swd_ose.ose_inuse =
			    btodb((u_int64_t)sdp->swd_npginuse <<
			    PAGE_SHIFT);
			(void)memcpy(sep, &sdp->swd_ose,
			    sizeof(struct oswapent));

			/* now copy out the path if necessary */
#if defined(COMPAT_13)
			if (cmd == SWAP_STATS)
#endif
				(void)memcpy(&sep->se_path, sdp->swd_path,
				    sdp->swd_pathlen);

			count++;
#if defined(COMPAT_13)
			if (cmd == SWAP_OSTATS)
				sep = (struct swapent *)
				    ((struct oswapent *)sep + 1);
			else
#endif
				sep++;
		}
	}

	*retval = count;
	return;
}

/*
 * swap_on: attempt to enable a swapdev for swapping.   note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm.swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	u_long result;
	struct vattr va;
#ifdef NFS
	extern int (**nfsv2_vnodeop_p) __P((void *));
#endif /* NFS */
	dev_t dev;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);

	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblocks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_ose.ose_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	snprintf(sdp->swd_exname, sizeof(sdp->swd_exname), "swap0x%04x",
	    count++);

	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(sdp->swd_exname, 0, npages - 1, M_VMSWAP,
				    0, 0, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel region");
	}

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
		 */
		rootblocks += (rootblocks >> 5) +
			(rootblocks >> 6) +
			(rootblocks >> 7);
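		/*
		 * (the three shifts add 1/32 + 1/64 + 1/128, i.e. roughly
		 * 5.5% of rootblocks, the "extra 5 percent" mentioned
		 * above.)
		 */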
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (extent_alloc_region(sdp->swd_ex, addr,
					rootpages, EX_WAITOK))
			panic("swap_on: unable to preserve miniroot");

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * try to add anons to reflect the new swap space.
	 */

	error = uvm_anon_add(size);
	if (error) {
		goto bad;
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	simple_lock(&uvm.swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	simple_unlock(&uvm.swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	if (sdp->swd_ex) {
		extent_destroy(sdp->swd_ex);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	}
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev,0,0,0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	simple_unlock(&uvm.swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    anon_swap_off(sdp->swd_drumoffset,
			  sdp->swd_drumoffset + sdp->swd_drumsize)) {

		simple_lock(&uvm.swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		simple_unlock(&uvm.swap_data_lock);
		return ENOMEM;
	}
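
	/*
	 * at this point the only pages still charged to the device
	 * should be those marked bad via uvm_swap_markbad(), which by
	 * definition could not be paged back in; the KASSERT below
	 * checks exactly that.
	 */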
	KASSERT(sdp->swd_npginuse == sdp->swd_npgbad);

	/*
	 * done with the vnode.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	}

	/* remove anons from the system */
	uvm_anon_remove(sdp->swd_npages);

	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpages -= sdp->swd_npages;

	if (swaplist_find(sdp->swd_vp, 1) == NULL)
		panic("swap_off: swapdev not in list");
	swaplist_trim();
	simple_unlock(&uvm.swap_data_lock);

	/*
	 * free all resources!
	 */
	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
		    EX_WAITOK);
	extent_destroy(sdp->swd_ex);
	free(sdp, M_VMSWAP);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swread(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swwrite(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(bp)
	struct buf *bp;
{
	struct swapdev *sdp;
	struct vnode *vp;
	int s, pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	simple_unlock(&uvm.swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
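
	/*
	 * a worked example (illustrative numbers, assuming PAGE_SHIFT 12
	 * and DEV_BSIZE 512): drum page 300 on a swapdev with
	 * swd_drumoffset 257 becomes swapdev page 43, i.e. disk block
	 * 43 * 8 == 344 on the underlying device (btodb does the shift).
	 */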

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld",
		((bp->b_flags & B_READ) == 0) ? "write" : "read",
		sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		bp->b_blkno = bn;		/* swapdev block number */
		vp = sdp->swd_vp;		/* swapdev vnode pointer */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			vp->v_numoutput++;	/* put it on swapdev */
		}

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		splx(s);
		VOP_STRATEGY(bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(sdp, bp, bn)
	struct swapdev	*sdp;
	struct buf	*bp;
	int		bn;
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	caddr_t		addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	getvndxfer(vnx);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((u_int64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
				 	&vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;
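
		/*
		 * worked example (illustrative numbers): with swd_bsize
		 * 8192, byteoff 20480 and nra 1, VOP_BMAP was asked about
		 * file block 2, off is 4096, and sz is (1 + 1) * 8192 -
		 * 4096 == 12288 bytes (clipped to resid just above).
		 */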

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
			    "vp %p/%p offset 0x%x/0x%x",
			    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		getvndbuf(nbp);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_lblkno   = 0;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = vp;
		if (vp->v_type == VBLK) {
			nbp->vb_buf.b_dev = vp->v_rdev;
		}
		LIST_INIT(&nbp->vb_buf.b_dep);

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* sort it in and start I/O if we are not over our limit */
		disksort_blkno(&sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by disksort (above)
 */
static void
sw_reg_start(sdp)
	struct swapdev	*sdp;
{
	struct buf	*bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = BUFQ_FIRST(&sdp->swd_tab);
		if (bp == NULL)
			break;
		BUFQ_REMOVE(&sdp->swd_tab, bp);
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;

		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int		s, resid;
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error) {
		UVMHIST_LOG(pdhist, "  got error=%d !",
		    vbp->vb_buf.b_error, 0, 0, 0);

		/* pass error upward */
		vnx->vx_error = vbp->vb_buf.b_error;
	}

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone, pbp=%p error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm.swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(nslots, lessok)
	int *nslots;	/* IN/OUT */
	boolean_t lessok;
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long	result;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?   definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	simple_lock(&uvm.swap_data_lock);

ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN,
					 EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
					 &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			simple_unlock(&uvm.swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return (result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	simple_unlock(&uvm.swap_data_lock);
	return 0;
}
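
/*
 * a hedged usage sketch (not a function in this file): callers pair
 * uvm_swap_alloc with uvm_swap_free, accepting a shorter run when
 * lessok is TRUE, e.g.:
 *
 *	int nslots = npages;
 *	int slot = uvm_swap_alloc(&nslots, TRUE);
 *	if (slot == 0)
 *		... fail: drum slot 0 means no allocation ...
 *	... use drum slots [slot, slot + nslots) ...
 *	uvm_swap_free(slot, nslots);
 */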

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_markbad(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */

	sdp->swd_npgbad += nslots;
	UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_free(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.   must hold pri lock to do
	 * lookup and access the extent.
	 */

	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
			EX_MALLOCOK|EX_NOWAIT) != 0) {
		printf("warning: resource shortage: %d pages of swap lost\n",
			nslots);
	}
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 */

int
uvm_swap_put(swslot, ppsp, npages, flags)
	int swslot;
	struct vm_page **ppsp;
	int npages;
	int flags;
{
	int error;

	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	return error;
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 */

int
uvm_swap_get(page, swslot, flags)
	struct vm_page *page;
	int swslot, flags;
{
	int error;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return EIO;
	}
	error = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	if (error == 0) {

		/*
		 * this page is no longer only in swap.
		 */

		simple_lock(&uvm.swap_data_lock);
		KASSERT(uvmexp.swpgonly > 0);
		uvmexp.swpgonly--;
		simple_unlock(&uvm.swap_data_lock);
	}
	return error;
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(pps, startslot, npages, flags)
	struct vm_page **pps;
	int startslot, npages, flags;
{
	daddr_t startblk;
	struct	buf *bp;
	vaddr_t kva;
	int	error, s, mapinflags;
	boolean_t write, async;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/*
	 * convert starting drum slot to block number
	 */

	startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);
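
	/*
	 * e.g. with PAGE_SHIFT 12 and DEV_BSIZE 512 each page covers
	 * 8 disk blocks, so drum slot 5 maps to block 40 (illustrative
	 * numbers; btodb does the shift).
	 */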

	/*
	 * first, map the pages into the kernel.
	 */

	mapinflags = !write ?
		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
	kva = uvm_pagermapin(pps, npages, mapinflags);

	/*
	 * now allocate a buf for the i/o.
	 */

	s = splbio();
	bp = pool_get(&bufpool, PR_WAITOK);
	splx(s);

	/*
	 * fill in the bp.   we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */

	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = startblk;
	bp->b_vp = swapdev_vp;
	bp->b_dev = swapdev_vp->v_rdev;
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
	LIST_INIT(&bp->b_dep);

	/*
	 * bump v_numoutput (counter of number of active outputs).
	 */

	if (write) {
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */

	if (async) {
		bp->b_flags |= B_CALL;
		bp->b_iodone = uvm_aio_biodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */

	VOP_STRATEGY(bp);
	if (async)
		return 0;

	/*
	 * must be sync i/o.   wait for it to finish
	 */

	error = biowait(bp);

	/*
	 * kill the pager mapping
	 */

	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the buf and we're done.
	 */

	s = splbio();
	if (write)
		vwakeup(bp);
	pool_put(&bufpool, bp);
	splx(s);
	UVMHIST_LOG(pdhist, "<- done (sync)  error=%d", error, 0, 0, 0);
	return (error);
}