1 /*	$NetBSD: uvm_swap.c,v 1.77 2003/02/25 20:35:41 thorpej Exp $	*/
2 
3 /*
4  * Copyright (c) 1995, 1996, 1997 Matthew R. Green
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. The name of the author may not be used to endorse or promote products
16  *    derived from this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
25  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
31  * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
32  */
33 
34 #include <sys/cdefs.h>
35 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.77 2003/02/25 20:35:41 thorpej Exp $");
36 
37 #include "fs_nfs.h"
38 #include "opt_uvmhist.h"
39 #include "opt_compat_netbsd.h"
40 #include "opt_ddb.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/buf.h>
45 #include <sys/conf.h>
46 #include <sys/proc.h>
47 #include <sys/namei.h>
48 #include <sys/disklabel.h>
49 #include <sys/errno.h>
50 #include <sys/kernel.h>
51 #include <sys/malloc.h>
52 #include <sys/vnode.h>
53 #include <sys/file.h>
54 #include <sys/extent.h>
55 #include <sys/mount.h>
56 #include <sys/pool.h>
57 #include <sys/sa.h>
58 #include <sys/syscallargs.h>
59 #include <sys/swap.h>
60 
61 #include <uvm/uvm.h>
62 
63 #include <miscfs/specfs/specdev.h>
64 
65 /*
66  * uvm_swap.c: manage configuration and i/o to swap space.
67  */
68 
69 /*
70  * swap space is managed in the following way:
71  *
72  * each swap partition or file is described by a "swapdev" structure.
73  * each "swapdev" structure contains a "swapent" structure which contains
74  * information that is passed up to the user (via system calls).
75  *
76  * each swap partition is assigned a "priority" (int) which controls
77  * swap partition usage.
78  *
79  * the system maintains a global data structure describing all swap
80  * partitions/files.   there is a sorted LIST of "swappri" structures
81  * which describe "swapdev"'s at that priority.   this LIST is headed
82  * by the "swap_priority" global var.    each "swappri" contains a
83  * CIRCLEQ of "swapdev" structures at that priority.
84  *
85  * locking:
86  *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
87  *    system call and prevents the swap priority list from changing
88  *    while we are in the middle of a system call (e.g. SWAP_STATS).
89  *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
90  *    structures including the priority list, the swapdev structures,
91  *    and the swapmap extent.
92  *
93  * each swap device has the following info:
94  *  - swap device in use (could be disabled, preventing future use)
95  *  - swap enabled (allows new allocations on swap)
96  *  - map info in /dev/drum
97  *  - vnode pointer
98  * for swap files only:
99  *  - block size
100  *  - max byte count in buffer
101  *  - buffer
102  *
103  * userland controls and configures swap with the swapctl(2) system call.
104  * the sys_swapctl() function performs the following operations:
105  *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
106  *  [2] SWAP_STATS: given a pointer to an array of swapent structures
107  *	(passed in via "arg") of a size passed in via "misc" ... we load
108  *	the current swap config into the array. The actual work is done
109  *	in the uvm_swap_stats(9) function.
110  *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
111  *	priority in "misc", start swapping on it.
112  *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
113  *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
114  *	"misc")
115  */
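
/*
 * for example (an illustrative userland sketch, not part of this file),
 * swapping could be enabled on a hypothetical partition at priority 0
 * like so:
 *
 *	#include <err.h>
 *	#include <unistd.h>
 *	#include <sys/swap.h>
 *
 *	if (swapctl(SWAP_ON, "/dev/wd0b", 0) == -1)
 *		err(1, "swapctl");
 *
 * swapctl(2) prototypes as: int swapctl(int cmd, void *arg, int misc).
 */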
116 
117 /*
118  * swapdev: describes a single swap partition/file
119  *
120  * note the following should be true:
121  * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
122  * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
123  */
124 struct swapdev {
125 	struct oswapent swd_ose;
126 #define	swd_dev		swd_ose.ose_dev		/* device id */
127 #define	swd_flags	swd_ose.ose_flags	/* flags:inuse/enable/fake */
128 #define	swd_priority	swd_ose.ose_priority	/* our priority */
129 	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
130 	char			*swd_path;	/* saved pathname of device */
131 	int			swd_pathlen;	/* length of pathname */
132 	int			swd_npages;	/* #pages we can use */
133 	int			swd_npginuse;	/* #pages in use */
134 	int			swd_npgbad;	/* #pages bad */
135 	int			swd_drumoffset;	/* page0 offset in drum */
136 	int			swd_drumsize;	/* #pages in drum */
137 	struct extent		*swd_ex;	/* extent for this swapdev */
138 	char			swd_exname[12];	/* name of extent above */
139 	struct vnode		*swd_vp;	/* backing vnode */
140 	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */
141 
142 	int			swd_bsize;	/* blocksize (bytes) */
143 	int			swd_maxactive;	/* max active i/o reqs */
144 	struct bufq_state	swd_tab;	/* buffer list */
145 	int			swd_active;	/* number of active buffers */
146 };
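
/*
 * a sketch of the first invariant above as an assertion (illustrative
 * only; nothing in this file actually asserts it):
 *
 *	KASSERT(sdp->swd_ose.ose_inuse <= sdp->swd_ose.ose_nblks);
 */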
147 
148 /*
149  * swap device priority entry; the list is kept sorted on `spi_priority'.
150  */
151 struct swappri {
152 	int			spi_priority;     /* priority */
153 	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
154 	/* circleq of swapdevs at this priority */
155 	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
156 };
157 
158 /*
159  * The following two structures are used to keep track of data transfers
160  * on swap devices associated with regular files.
161  * NOTE: this code is more or less a copy of vnd.c; we use the same
162  * structure names here to ease porting..
163  */
164 struct vndxfer {
165 	struct buf	*vx_bp;		/* Pointer to parent buffer */
166 	struct swapdev	*vx_sdp;
167 	int		vx_error;
168 	int		vx_pending;	/* # of pending aux buffers */
169 	int		vx_flags;
170 #define VX_BUSY		1
171 #define VX_DEAD		2
172 };
173 
174 struct vndbuf {
175 	struct buf	vb_buf;
176 	struct vndxfer	*vb_xfer;
177 };
178 
179 
180 /*
181  * We keep a pool of vndbuf's and a pool of vndxfer structures.
182  */
183 static struct pool vndxfer_pool;
184 static struct pool vndbuf_pool;
185 
186 #define	getvndxfer(vnx)	do {						\
187 	int s = splbio();						\
188 	vnx = pool_get(&vndxfer_pool, PR_WAITOK);			\
189 	splx(s);							\
190 } while (/*CONSTCOND*/ 0)
191 
192 #define putvndxfer(vnx) {						\
193 	pool_put(&vndxfer_pool, (void *)(vnx));				\
194 }
195 
196 #define	getvndbuf(vbp)	do {						\
197 	int s = splbio();						\
198 	vbp = pool_get(&vndbuf_pool, PR_WAITOK);			\
199 	splx(s);							\
200 } while (/*CONSTCOND*/ 0)
201 
202 #define putvndbuf(vbp) {						\
203 	pool_put(&vndbuf_pool, (void *)(vbp));				\
204 }
205 
206 /*
207  * local variables
208  */
209 static struct extent *swapmap;		/* controls the mapping of /dev/drum */
210 
211 MALLOC_DEFINE(M_VMSWAP, "VM swap", "VM swap structures");
212 
213 /* list of all active swap devices [by priority] */
214 LIST_HEAD(swap_priority, swappri);
215 static struct swap_priority swap_priority;
216 
217 /* locks */
218 struct lock swap_syscall_lock;
219 
220 /*
221  * prototypes
222  */
223 static struct swapdev	*swapdrum_getsdp __P((int));
224 
225 static struct swapdev	*swaplist_find __P((struct vnode *, int));
226 static void		 swaplist_insert __P((struct swapdev *,
227 					     struct swappri *, int));
228 static void		 swaplist_trim __P((void));
229 
230 static int swap_on __P((struct proc *, struct swapdev *));
231 static int swap_off __P((struct proc *, struct swapdev *));
232 
233 static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
234 static void sw_reg_iodone __P((struct buf *));
235 static void sw_reg_start __P((struct swapdev *));
236 
237 static int uvm_swap_io __P((struct vm_page **, int, int, int));
238 
239 dev_type_read(swread);
240 dev_type_write(swwrite);
241 dev_type_strategy(swstrategy);
242 
243 const struct bdevsw swap_bdevsw = {
244 	noopen, noclose, swstrategy, noioctl, nodump, nosize,
245 };
246 
247 const struct cdevsw swap_cdevsw = {
248 	nullopen, nullclose, swread, swwrite, noioctl,
249 	nostop, notty, nopoll, nommap, nokqfilter
250 };
251 
252 /*
253  * uvm_swap_init: init the swap system data structures and locks
254  *
255  * => called at boot time from init_main.c after the filesystems
256  *	are brought up (which happens after uvm_init())
257  */
258 void
259 uvm_swap_init()
260 {
261 	UVMHIST_FUNC("uvm_swap_init");
262 
263 	UVMHIST_CALLED(pdhist);
264 	/*
265 	 * first, init the swap list, its counter, and its lock.
266 	 * then get a handle on the vnode for /dev/drum by using
267 	 * its dev_t number ("swapdev", from MD conf.c).
268 	 */
269 
270 	LIST_INIT(&swap_priority);
271 	uvmexp.nswapdev = 0;
272 	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
273 	simple_lock_init(&uvm.swap_data_lock);
274 
275 	if (bdevvp(swapdev, &swapdev_vp))
276 		panic("uvm_swap_init: can't get vnode for swap device");
277 
278 	/*
279 	 * create swap block resource map to map /dev/drum.   the range
280 	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
281 	 * that block 0 is reserved (used to indicate an allocation
282 	 * failure, or no allocation).
283 	 */
284 	swapmap = extent_create("swapmap", 1, INT_MAX,
285 				M_VMSWAP, 0, 0, EX_NOWAIT);
286 	if (swapmap == NULL)
287 		panic("uvm_swap_init: extent_create failed");
288 
289 	/*
290 	 * allocate pools for structures used for swapping to files.
291 	 */
292 
293 	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0,
294 	    "swp vnx", NULL);
295 
296 	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0,
297 	    "swp vnd", NULL);
298 
299 	/*
300 	 * done!
301 	 */
302 	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
303 }
304 
305 /*
306  * swaplist functions: functions that operate on the list of swap
307  * devices on the system.
308  */
309 
310 /*
311  * swaplist_insert: insert swap device "sdp" into the global list
312  *
313  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
314  * => caller must provide a newly malloc'd swappri structure (we will
315  *	FREE it if we don't need it... this is to prevent malloc from blocking
316  *	here while adding swap)
317  */
318 static void
319 swaplist_insert(sdp, newspp, priority)
320 	struct swapdev *sdp;
321 	struct swappri *newspp;
322 	int priority;
323 {
324 	struct swappri *spp, *pspp;
325 	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);
326 
327 	/*
328 	 * find entry at or after which to insert the new device.
329 	 */
330 	pspp = NULL;
331 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
332 		if (priority <= spp->spi_priority)
333 			break;
334 		pspp = spp;
335 	}
336 
337 	/*
338 	 * new priority?
339 	 */
340 	if (spp == NULL || spp->spi_priority != priority) {
341 		spp = newspp;  /* use newspp! */
342 		UVMHIST_LOG(pdhist, "created new swappri = %d",
343 			    priority, 0, 0, 0);
344 
345 		spp->spi_priority = priority;
346 		CIRCLEQ_INIT(&spp->spi_swapdev);
347 
348 		if (pspp)
349 			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
350 		else
351 			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
352 	} else {
353 		/* we don't need a new priority structure, free it */
354 		FREE(newspp, M_VMSWAP);
355 	}
356 
357 	/*
358 	 * priority found (or created).   now insert on the priority's
359 	 * circleq list and bump the total number of swapdevs.
360 	 */
361 	sdp->swd_priority = priority;
362 	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
363 	uvmexp.nswapdev++;
364 }
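
/*
 * typical caller pattern (sketch; compare the SWAP_CTL case in
 * sys_swapctl below): the swappri is allocated before taking the lock
 * so that swaplist_insert never has to malloc while holding it:
 *
 *	spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
 *	simple_lock(&uvm.swap_data_lock);
 *	swaplist_insert(sdp, spp, priority);	(frees spp if unneeded)
 *	simple_unlock(&uvm.swap_data_lock);
 */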
365 
366 /*
367  * swaplist_find: find and optionally remove a swap device from the
368  *	global list.
369  *
370  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
371  * => we return the swapdev we found (and removed)
372  */
373 static struct swapdev *
374 swaplist_find(vp, remove)
375 	struct vnode *vp;
376 	boolean_t remove;
377 {
378 	struct swapdev *sdp;
379 	struct swappri *spp;
380 
381 	/*
382 	 * search the lists for the requested vp
383 	 */
384 
385 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
386 		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
387 			if (sdp->swd_vp == vp) {
388 				if (remove) {
389 					CIRCLEQ_REMOVE(&spp->spi_swapdev,
390 					    sdp, swd_next);
391 					uvmexp.nswapdev--;
392 				}
393 				return(sdp);
394 			}
395 		}
396 	}
397 	return (NULL);
398 }
399 
400 
401 /*
402  * swaplist_trim: scan priority list for empty priority entries and kill
403  *	them.
404  *
405  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
406  */
407 static void
408 swaplist_trim()
409 {
410 	struct swappri *spp, *nextspp;
411 
412 	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
413 		nextspp = LIST_NEXT(spp, spi_swappri);
414 		if (!CIRCLEQ_EMPTY(&spp->spi_swapdev))
415 			continue;
417 		LIST_REMOVE(spp, spi_swappri);
418 		free(spp, M_VMSWAP);
419 	}
420 }
421 
422 /*
423  * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
424  *	to the "swapdev" that maps that section of the drum.
425  *
426  * => each swapdev takes one big contig chunk of the drum
427  * => caller must hold uvm.swap_data_lock
428  */
429 static struct swapdev *
430 swapdrum_getsdp(pgno)
431 	int pgno;
432 {
433 	struct swapdev *sdp;
434 	struct swappri *spp;
435 
436 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
437 		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
438 			if (sdp->swd_flags & SWF_FAKE)
439 				continue;
440 			if (pgno >= sdp->swd_drumoffset &&
441 			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
442 				return sdp;
443 			}
444 		}
445 	}
446 	return NULL;
447 }
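
/*
 * worked example with hypothetical numbers: if a 1024-page swapdev owns
 * drum pages [1, 1024] and a 2048-page one owns [1025, 3072], then
 * swapdrum_getsdp(2000) returns the second sdp because
 * 1025 <= 2000 < 1025 + 2048.
 */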
448 
449 
450 /*
451  * sys_swapctl: main entry point for swapctl(2) system call
452  * 	[with two helper functions: swap_on and swap_off]
453  */
454 int
455 sys_swapctl(l, v, retval)
456 	struct lwp *l;
457 	void *v;
458 	register_t *retval;
459 {
460 	struct sys_swapctl_args /* {
461 		syscallarg(int) cmd;
462 		syscallarg(void *) arg;
463 		syscallarg(int) misc;
464 	} */ *uap = (struct sys_swapctl_args *)v;
465 	struct proc *p = l->l_proc;
466 	struct vnode *vp;
467 	struct nameidata nd;
468 	struct swappri *spp;
469 	struct swapdev *sdp;
470 	struct swapent *sep;
471 	char	userpath[PATH_MAX + 1];
472 	size_t	len;
473 	int	error, misc;
474 	int	priority;
475 	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);
476 
477 	misc = SCARG(uap, misc);
478 
479 	/*
480 	 * ensure serialized syscall access by grabbing the swap_syscall_lock
481 	 */
482 	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, NULL);
483 
484 	/*
485 	 * we handle the non-priv NSWAP and STATS request first.
486 	 *
487 	 * SWAP_NSWAP: return number of config'd swap devices
488 	 * [can also be obtained with uvmexp sysctl]
489 	 */
490 	if (SCARG(uap, cmd) == SWAP_NSWAP) {
491 		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
492 		    0, 0, 0);
493 		*retval = uvmexp.nswapdev;
494 		error = 0;
495 		goto out;
496 	}
497 
498 	/*
499 	 * SWAP_STATS: get stats on current # of configured swap devs
500 	 *
501 	 * note that the swap_priority list can't change as long
502 	 * as we are holding the swap_syscall_lock.  we don't want
503 	 * to grab the uvm.swap_data_lock because we may fault&sleep during
504 	 * copyout() and we don't want to be holding that lock then!
505 	 */
506 	if (SCARG(uap, cmd) == SWAP_STATS
507 #if defined(COMPAT_13)
508 	    || SCARG(uap, cmd) == SWAP_OSTATS
509 #endif
510 	    ) {
511 		misc = MIN(uvmexp.nswapdev, misc);
512 #if defined(COMPAT_13)
513 		if (SCARG(uap, cmd) == SWAP_OSTATS)
514 			len = sizeof(struct oswapent) * misc;
515 		else
516 #endif
517 			len = sizeof(struct swapent) * misc;
518 		sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK);
519 
520 		uvm_swap_stats(SCARG(uap, cmd), sep, misc, retval);
521 		error = copyout(sep, (void *)SCARG(uap, arg), len);
522 
523 		free(sep, M_TEMP);
524 		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
525 		goto out;
526 	}
527 	if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
528 		dev_t	*devp = (dev_t *)SCARG(uap, arg);
529 
530 		error = copyout(&dumpdev, devp, sizeof(dumpdev));
531 		goto out;
532 	}
533 
534 	/*
535 	 * all other requests require superuser privs.   verify.
536 	 */
537 	if ((error = suser(p->p_ucred, &p->p_acflag)))
538 		goto out;
539 
540 	/*
541 	 * at this point we expect a path name in arg.   we will
542 	 * use namei() to gain a vnode reference (vref), and lock
543 	 * the vnode (VOP_LOCK).
544 	 *
545 	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
546 	 * miniroot)
547 	 */
548 	if (SCARG(uap, arg) == NULL) {
549 		vp = rootvp;		/* miniroot */
550 		if (vget(vp, LK_EXCLUSIVE)) {
551 			error = EBUSY;
552 			goto out;
553 		}
554 		if (SCARG(uap, cmd) == SWAP_ON &&
555 		    copystr("miniroot", userpath, sizeof userpath, &len))
556 			panic("swapctl: miniroot copy failed");
557 	} else {
558 		int	space;
559 		char	*where;
560 
561 		if (SCARG(uap, cmd) == SWAP_ON) {
562 			if ((error = copyinstr(SCARG(uap, arg), userpath,
563 			    sizeof userpath, &len)))
564 				goto out;
565 			space = UIO_SYSSPACE;
566 			where = userpath;
567 		} else {
568 			space = UIO_USERSPACE;
569 			where = (char *)SCARG(uap, arg);
570 		}
571 		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
572 		if ((error = namei(&nd)))
573 			goto out;
574 		vp = nd.ni_vp;
575 	}
576 	/* note: "vp" is referenced and locked */
577 
578 	error = 0;		/* assume no error */
579 	switch (SCARG(uap, cmd)) {
580 
581 	case SWAP_DUMPDEV:
582 		if (vp->v_type != VBLK) {
583 			error = ENOTBLK;
584 			break;
585 		}
586 		dumpdev = vp->v_rdev;
587 		cpu_dumpconf();
588 		break;
589 
590 	case SWAP_CTL:
591 		/*
592 		 * get new priority, remove old entry (if any) and then
593 		 * reinsert it in the correct place.  finally, prune out
594 		 * any empty priority structures.
595 		 */
596 		priority = SCARG(uap, misc);
597 		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
598 		simple_lock(&uvm.swap_data_lock);
599 		if ((sdp = swaplist_find(vp, 1)) == NULL) {
600 			error = ENOENT;
601 		} else {
602 			swaplist_insert(sdp, spp, priority);
603 			swaplist_trim();
604 		}
605 		simple_unlock(&uvm.swap_data_lock);
606 		if (error)
607 			free(spp, M_VMSWAP);
608 		break;
609 
610 	case SWAP_ON:
611 
612 		/*
613 		 * check for duplicates.   if none found, then insert a
614 		 * dummy entry on the list to prevent someone else from
615 		 * trying to enable this device while we are working on
616 		 * it.
617 		 */
618 
619 		priority = SCARG(uap, misc);
620 		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
621 		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
622 		memset(sdp, 0, sizeof(*sdp));
623 		sdp->swd_flags = SWF_FAKE;
624 		sdp->swd_vp = vp;
625 		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
626 		bufq_alloc(&sdp->swd_tab, BUFQ_DISKSORT|BUFQ_SORT_RAWBLOCK);
627 		simple_lock(&uvm.swap_data_lock);
628 		if (swaplist_find(vp, 0) != NULL) {
629 			error = EBUSY;
630 			simple_unlock(&uvm.swap_data_lock);
631 			bufq_free(&sdp->swd_tab);
632 			free(sdp, M_VMSWAP);
633 			free(spp, M_VMSWAP);
634 			break;
635 		}
636 		swaplist_insert(sdp, spp, priority);
637 		simple_unlock(&uvm.swap_data_lock);
638 
639 		sdp->swd_pathlen = len;
640 		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
641 		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
642 			panic("swapctl: copystr");
643 
644 		/*
645 		 * we've now got a FAKE placeholder in the swap list.
646 		 * now attempt to enable swap on it.  if we fail, undo
647 		 * what we've done and kill the fake entry we just inserted.
648 		 * if swap_on is a success, it will clear the SWF_FAKE flag
649 		 */
650 
651 		if ((error = swap_on(p, sdp)) != 0) {
652 			simple_lock(&uvm.swap_data_lock);
653 			(void) swaplist_find(vp, 1);  /* kill fake entry */
654 			swaplist_trim();
655 			simple_unlock(&uvm.swap_data_lock);
656 			bufq_free(&sdp->swd_tab);
657 			free(sdp->swd_path, M_VMSWAP);
658 			free(sdp, M_VMSWAP);
659 			break;
660 		}
661 		break;
662 
663 	case SWAP_OFF:
664 		simple_lock(&uvm.swap_data_lock);
665 		if ((sdp = swaplist_find(vp, 0)) == NULL) {
666 			simple_unlock(&uvm.swap_data_lock);
667 			error = ENXIO;
668 			break;
669 		}
670 
671 		/*
672 		 * If a device isn't in use or enabled, we
673 		 * can't stop swapping from it (again).
674 		 */
675 		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
676 			simple_unlock(&uvm.swap_data_lock);
677 			error = EBUSY;
678 			break;
679 		}
680 
681 		/*
682 		 * do the real work.
683 		 */
684 		error = swap_off(p, sdp);
685 		break;
686 
687 	default:
688 		error = EINVAL;
689 	}
690 
691 	/*
692 	 * done!  release the ref gained by namei() and unlock.
693 	 */
694 	vput(vp);
695 
696 out:
697 	lockmgr(&swap_syscall_lock, LK_RELEASE, NULL);
698 
699 	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
700 	return (error);
701 }
702 
703 /*
704  * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
705  * away from sys_swapctl() in order to allow COMPAT_* swapctl()
706  * emulation to use it directly without going through sys_swapctl().
707  * The problem with using sys_swapctl() there is that it involves
708  * copying the swapent array to the stackgap, and this array's size
709  * is not known at build time. Hence it would not be possible to
710  * ensure it would fit in the stackgap in any case.
711  */
712 void
713 uvm_swap_stats(cmd, sep, sec, retval)
714 	int cmd;
715 	struct swapent *sep;
716 	int sec;
717 	register_t *retval;
718 {
719 	struct swappri *spp;
720 	struct swapdev *sdp;
721 	int count = 0;
722 
723 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
724 		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
725 		     sdp != (void *)&spp->spi_swapdev && sec-- > 0;
726 		     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
727 			/*
728 			 * backwards compatibility for system call.
729 			 * note that we use 'struct oswapent' as an
730 			 * overlay into both 'struct swapdev' and
731 			 * the userland 'struct swapent', as we
732 			 * want to retain backwards compatibility
733 			 * with NetBSD 1.3.
734 			 */
735 			sdp->swd_ose.ose_inuse =
736 			    btodb((u_int64_t)sdp->swd_npginuse <<
737 			    PAGE_SHIFT);
738 			(void)memcpy(sep, &sdp->swd_ose,
739 			    sizeof(struct oswapent));
740 
741 			/* now copy out the path if necessary */
742 #if defined(COMPAT_13)
743 			if (cmd == SWAP_STATS)
744 #endif
745 				(void)memcpy(&sep->se_path, sdp->swd_path,
746 				    sdp->swd_pathlen);
747 
748 			count++;
749 #if defined(COMPAT_13)
750 			if (cmd == SWAP_OSTATS)
751 				sep = (struct swapent *)
752 				    ((struct oswapent *)sep + 1);
753 			else
754 #endif
755 				sep++;
756 		}
757 	}
758 
759 	*retval = count;
760 	return;
761 }
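
/*
 * the overlay trick above depends on 'struct swapent' starting with the
 * same members as 'struct oswapent', roughly (a sketch of the layout;
 * see sys/swap.h for the real definitions):
 *
 *	struct swapent {
 *		dev_t	se_dev;		(matches ose_dev)
 *		int	se_flags;	(matches ose_flags)
 *		int	se_nblks;	(matches ose_nblks)
 *		int	se_inuse;	(matches ose_inuse)
 *		int	se_priority;	(matches ose_priority)
 *		char	se_path[PATH_MAX + 1];
 *	};
 *
 * so a single memcpy of the oswapent fills the common prefix, and
 * SWAP_OSTATS callers simply never see se_path.
 */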
762 
763 /*
764  * swap_on: attempt to enable a swapdev for swapping.   note that the
765  *	swapdev is already on the global list, but disabled (marked
766  *	SWF_FAKE).
767  *
768  * => we avoid the start of the disk (to protect disk labels)
769  * => we also avoid the miniroot, if we are swapping to root.
770  * => caller should leave uvm.swap_data_lock unlocked, we may lock it
771  *	if needed.
772  */
773 static int
774 swap_on(p, sdp)
775 	struct proc *p;
776 	struct swapdev *sdp;
777 {
778 	static int count = 0;	/* static */
779 	struct vnode *vp;
780 	int error, npages, nblocks, size;
781 	long addr;
782 	u_long result;
783 	struct vattr va;
784 #ifdef NFS
785 	extern int (**nfsv2_vnodeop_p) __P((void *));
786 #endif /* NFS */
787 	const struct bdevsw *bdev;
788 	dev_t dev;
789 	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
790 
791 	/*
792 	 * we want to enable swapping on sdp.   the swd_vp contains
793 	 * the vnode we want (locked and ref'd), and the swd_dev
794 	 * contains the dev_t of the file, if it is a block device.
795 	 */
796 
797 	vp = sdp->swd_vp;
798 	dev = sdp->swd_dev;
799 
800 	/*
801 	 * open the swap file (mostly useful for block device files to
802 	 * let device driver know what is up).
803 	 *
804 	 * we skip the open/close for root on swap because the root
805 	 * has already been opened when root was mounted (mountroot).
806 	 */
807 	if (vp != rootvp) {
808 		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
809 			return (error);
810 	}
811 
812 	/* XXX this only works for block devices */
813 	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);
814 
815 	/*
816 	 * we now need to determine the size of the swap area.   for
817 	 * block specials we can call the d_psize function.
818 	 * for normal files, we must stat [get attrs].
819 	 *
820 	 * we put the result in nblocks.
821 	 * for normal files, we also want the filesystem block size
822 	 * (which we get with statfs).
823 	 */
824 	switch (vp->v_type) {
825 	case VBLK:
826 		bdev = bdevsw_lookup(dev);
827 		if (bdev == NULL || bdev->d_psize == NULL ||
828 		    (nblocks = (*bdev->d_psize)(dev)) == -1) {
829 			error = ENXIO;
830 			goto bad;
831 		}
832 		break;
833 
834 	case VREG:
835 		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
836 			goto bad;
837 		nblocks = (int)btodb(va.va_size);
838 		if ((error =
839 		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
840 			goto bad;
841 
842 		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
843 		/*
844 		 * limit the max # of outstanding I/O requests we issue
845 		 * at any one time.   take it easy on NFS servers.
846 		 */
847 #ifdef NFS
848 		if (vp->v_op == nfsv2_vnodeop_p)
849 			sdp->swd_maxactive = 2; /* XXX */
850 		else
851 #endif /* NFS */
852 			sdp->swd_maxactive = 8; /* XXX */
853 		break;
854 
855 	default:
856 		error = ENXIO;
857 		goto bad;
858 	}
859 
860 	/*
861 	 * save nblocks in a safe place and convert to pages.
862 	 */
863 
864 	sdp->swd_ose.ose_nblks = nblocks;
865 	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;
866 
867 	/*
868 	 * for block special files, we want to make sure that we leave
869 	 * the disklabel and bootblocks alone, so we arrange to skip
870 	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
871 	 * note that because of this the "size" can be less than the
872 	 * actual number of blocks on the device.
873 	 */
874 	if (vp->v_type == VBLK) {
875 		/* we use pages 1 to (size - 1) [inclusive] */
876 		size = npages - 1;
877 		addr = 1;
878 	} else {
879 		/* we use pages 0 to (size - 1) [inclusive] */
880 		size = npages;
881 		addr = 0;
882 	}
883 
884 	/*
885 	 * make sure we have enough blocks for a reasonable sized swap
886 	 * area.   we want at least one page.
887 	 */
888 
889 	if (size < 1) {
890 		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
891 		error = EINVAL;
892 		goto bad;
893 	}
894 
895 	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);
896 
897 	/*
898 	 * now we need to allocate an extent to manage this swap device
899 	 */
900 	snprintf(sdp->swd_exname, sizeof(sdp->swd_exname), "swap0x%04x",
901 	    count++);
902 
903 	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
904 	sdp->swd_ex = extent_create(sdp->swd_exname, 0, npages - 1, M_VMSWAP,
905 				    0, 0, EX_WAITOK);
906 	/* allocate the `saved' region from the extent so it won't be used */
907 	if (addr) {
908 		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
909 			panic("disklabel region");
910 	}
911 
912 	/*
913 	 * if the vnode we are swapping to is the root vnode
914 	 * (i.e. we are swapping to the miniroot) then we want
915 	 * to make sure we don't overwrite it.   do a statfs to
916 	 * find its size and skip over it.
917 	 */
918 	if (vp == rootvp) {
919 		struct mount *mp;
920 		struct statfs *sp;
921 		int rootblocks, rootpages;
922 
923 		mp = rootvnode->v_mount;
924 		sp = &mp->mnt_stat;
925 		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
926 		/*
927 		 * XXX: sp->f_blocks isn't the total number of
928 		 * blocks in the filesystem, it's the number of
929 		 * data blocks.  so, our rootblocks almost
930 		 * definitely underestimates the total size
931 		 * of the filesystem - how badly depends on the
932 		 * details of the filesystem type.  there isn't
933 		 * an obvious way to deal with this cleanly
934 		 * and perfectly, so for now we just pad our
935 		 * rootblocks estimate by 1/32 + 1/64 + 1/128 (about 5.5 percent).
936 		 */
937 		rootblocks += (rootblocks >> 5) +
938 			(rootblocks >> 6) +
939 			(rootblocks >> 7);
940 		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
941 		if (rootpages > size)
942 			panic("swap_on: miniroot larger than swap?");
943 
944 		if (extent_alloc_region(sdp->swd_ex, addr,
945 					rootpages, EX_WAITOK))
946 			panic("swap_on: unable to preserve miniroot");
947 
948 		size -= rootpages;
949 		printf("Preserved %d pages of miniroot ", rootpages);
950 		printf("leaving %d pages of swap\n", size);
951 	}
952 
953 	/*
954 	 * try to add anons to reflect the new swap space.
955 	 */
956 
957 	error = uvm_anon_add(size);
958 	if (error) {
959 		goto bad;
960 	}
961 
962 	/*
963 	 * add a ref to vp to reflect usage as a swap device.
964 	 */
965 	vref(vp);
966 
967 	/*
968 	 * now add the new swapdev to the drum and enable.
969 	 */
970 	if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY,
971 	    EX_WAITOK, &result))
972 		panic("swapdrum_add");
973 
974 	sdp->swd_drumoffset = (int)result;
975 	sdp->swd_drumsize = npages;
976 	sdp->swd_npages = size;
977 	simple_lock(&uvm.swap_data_lock);
978 	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
979 	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
980 	uvmexp.swpages += size;
981 	simple_unlock(&uvm.swap_data_lock);
982 	return (0);
983 
984 	/*
985 	 * failure: clean up and return error.
986 	 */
987 
988 bad:
989 	if (sdp->swd_ex) {
990 		extent_destroy(sdp->swd_ex);
991 	}
992 	if (vp != rootvp) {
993 		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
994 	}
995 	return (error);
996 }
997 
998 /*
999  * swap_off: stop swapping on swapdev
1000  *
1001  * => swap data should be locked, we will unlock.
1002  */
1003 static int
1004 swap_off(p, sdp)
1005 	struct proc *p;
1006 	struct swapdev *sdp;
1007 {
1008 	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
1009 	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev,0,0,0);
1010 
1011 	/* disable the swap area being removed */
1012 	sdp->swd_flags &= ~SWF_ENABLE;
1013 	simple_unlock(&uvm.swap_data_lock);
1014 
1015 	/*
1016 	 * the idea is to find all the pages that are paged out to this
1017 	 * device, and page them all in.  in uvm, swap-backed pageable
1018 	 * memory can take two forms: aobjs and anons.  call the
1019 	 * swapoff hook for each subsystem to bring in pages.
1020 	 */
1021 
1022 	if (uao_swap_off(sdp->swd_drumoffset,
1023 			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
1024 	    anon_swap_off(sdp->swd_drumoffset,
1025 			  sdp->swd_drumoffset + sdp->swd_drumsize)) {
1026 
1027 		simple_lock(&uvm.swap_data_lock);
1028 		sdp->swd_flags |= SWF_ENABLE;
1029 		simple_unlock(&uvm.swap_data_lock);
1030 		return ENOMEM;
1031 	}
1032 	KASSERT(sdp->swd_npginuse == sdp->swd_npgbad);
1033 
1034 	/*
1035 	 * done with the vnode.
1036 	 * drop our ref on the vnode before calling VOP_CLOSE()
1037 	 * so that spec_close() can tell if this is the last close.
1038 	 */
1039 	vrele(sdp->swd_vp);
1040 	if (sdp->swd_vp != rootvp) {
1041 		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
1042 	}
1043 
1044 	/* remove anons from the system */
1045 	uvm_anon_remove(sdp->swd_npages);
1046 
1047 	simple_lock(&uvm.swap_data_lock);
1048 	uvmexp.swpages -= sdp->swd_npages;
1049 
1050 	if (swaplist_find(sdp->swd_vp, 1) == NULL)
1051 		panic("swap_off: swapdev not in list");
1052 	swaplist_trim();
1053 	simple_unlock(&uvm.swap_data_lock);
1054 
1055 	/*
1056 	 * free all resources!
1057 	 */
1058 	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
1059 		    EX_WAITOK);
1060 	extent_destroy(sdp->swd_ex);
1061 	bufq_free(&sdp->swd_tab);
1062 	free(sdp, M_VMSWAP);
1063 	return (0);
1064 }
1065 
1066 /*
1067  * /dev/drum interface and i/o functions
1068  */
1069 
1070 /*
1071  * swread: the read function for the drum (just a call to physio)
1072  */
1073 /*ARGSUSED*/
1074 int
1075 swread(dev, uio, ioflag)
1076 	dev_t dev;
1077 	struct uio *uio;
1078 	int ioflag;
1079 {
1080 	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);
1081 
1082 	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
1083 	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1084 }
1085 
1086 /*
1087  * swwrite: the write function for the drum (just a call to physio)
1088  */
1089 /*ARGSUSED*/
1090 int
1091 swwrite(dev, uio, ioflag)
1092 	dev_t dev;
1093 	struct uio *uio;
1094 	int ioflag;
1095 {
1096 	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);
1097 
1098 	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
1099 	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1100 }
1101 
1102 /*
1103  * swstrategy: perform I/O on the drum
1104  *
1105  * => we must map the i/o request from the drum to the correct swapdev.
1106  */
1107 void
1108 swstrategy(bp)
1109 	struct buf *bp;
1110 {
1111 	struct swapdev *sdp;
1112 	struct vnode *vp;
1113 	int s, pageno, bn;
1114 	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);
1115 
1116 	/*
1117 	 * convert block number to swapdev.   note that swapdev can't
1118 	 * be yanked out from under us because we are holding resources
1119 	 * in it (i.e. the blocks we are doing I/O on).
1120 	 */
1121 	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
1122 	simple_lock(&uvm.swap_data_lock);
1123 	sdp = swapdrum_getsdp(pageno);
1124 	simple_unlock(&uvm.swap_data_lock);
1125 	if (sdp == NULL) {
1126 		bp->b_error = EINVAL;
1127 		bp->b_flags |= B_ERROR;
1128 		biodone(bp);
1129 		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
1130 		return;
1131 	}
1132 
1133 	/*
1134 	 * convert drum page number to block number on this swapdev.
1135 	 */
1136 
1137 	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
1138 	bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
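
	/*
	 * worked example (hypothetical but typical values): with 4096-byte
	 * pages and 512-byte disk blocks, an i/o at drum block 8192 is drum
	 * page 8192 >> 3 = 1024; on a swapdev with swd_drumoffset 1 that is
	 * swapdev page 1023, so bn = 1023 << 3 = 8184.
	 */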
1139 
1140 	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld",
1141 		((bp->b_flags & B_READ) == 0) ? "write" : "read",
1142 		sdp->swd_drumoffset, bn, bp->b_bcount);
1143 
1144 	/*
1145 	 * for block devices we finish up here.
1146 	 * for regular files we have to do more work which we delegate
1147 	 * to sw_reg_strategy().
1148 	 */
1149 
1150 	switch (sdp->swd_vp->v_type) {
1151 	default:
1152 		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
1153 
1154 	case VBLK:
1155 
1156 		/*
1157 		 * must convert "bp" from an I/O on /dev/drum to an I/O
1158 		 * on the swapdev (sdp).
1159 		 */
1160 		s = splbio();
1161 		bp->b_blkno = bn;		/* swapdev block number */
1162 		vp = sdp->swd_vp;		/* swapdev vnode pointer */
1163 		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
1164 
1165 		/*
1166 		 * if we are doing a write, we have to redirect the i/o on
1167 		 * drum's v_numoutput counter to the swapdevs.
1168 		 */
1169 		if ((bp->b_flags & B_READ) == 0) {
1170 			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
1171 			V_INCR_NUMOUTPUT(vp);	/* put it on swapdev */
1172 		}
1173 
1174 		/*
1175 		 * finally plug in swapdev vnode and start I/O
1176 		 */
1177 		bp->b_vp = vp;
1178 		splx(s);
1179 		VOP_STRATEGY(bp);
1180 		return;
1181 
1182 	case VREG:
1183 		/*
1184 		 * delegate to sw_reg_strategy function.
1185 		 */
1186 		sw_reg_strategy(sdp, bp, bn);
1187 		return;
1188 	}
1189 	/* NOTREACHED */
1190 }
1191 
1192 /*
1193  * sw_reg_strategy: handle swap i/o to regular files
1194  */
1195 static void
1196 sw_reg_strategy(sdp, bp, bn)
1197 	struct swapdev	*sdp;
1198 	struct buf	*bp;
1199 	int		bn;
1200 {
1201 	struct vnode	*vp;
1202 	struct vndxfer	*vnx;
1203 	daddr_t		nbn;
1204 	caddr_t		addr;
1205 	off_t		byteoff;
1206 	int		s, off, nra, error, sz, resid;
1207 	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);
1208 
1209 	/*
1210 	 * allocate a vndxfer head for this transfer and point it to
1211 	 * our buffer.
1212 	 */
1213 	getvndxfer(vnx);
1214 	vnx->vx_flags = VX_BUSY;
1215 	vnx->vx_error = 0;
1216 	vnx->vx_pending = 0;
1217 	vnx->vx_bp = bp;
1218 	vnx->vx_sdp = sdp;
1219 
1220 	/*
1221 	 * setup for main loop where we read filesystem blocks into
1222 	 * our buffer.
1223 	 */
1224 	error = 0;
1225 	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
1226 	addr = bp->b_data;		/* current position in buffer */
1227 	byteoff = dbtob((u_int64_t)bn);
1228 
1229 	for (resid = bp->b_resid; resid; resid -= sz) {
1230 		struct vndbuf	*nbp;
1231 
1232 		/*
1233 		 * translate byteoffset into block number.  return values:
1234 		 *   vp = vnode of underlying device
1235 		 *  nbn = new block number (on underlying vnode dev)
1236 		 *  nra = num blocks we can read-ahead (excludes requested
1237 		 *	block)
1238 		 */
1239 		nra = 0;
1240 		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1241 				 	&vp, &nbn, &nra);
1242 
1243 		if (error == 0 && nbn == (daddr_t)-1) {
1244 			/*
1245 			 * this used to just set error, but that doesn't
1246 			 * do the right thing: it leads to random memory
1247 			 * corruption.  the panic() should remain until
1248 			 * this condition no longer destabilizes the system.
1249 			 */
1250 #if 1
1251 			panic("sw_reg_strategy: swap to sparse file");
1252 #else
1253 			error = EIO;	/* failure */
1254 #endif
1255 		}
1256 
1257 		/*
1258 		 * punt if there was an error or a hole in the file.
1259 		 * we must wait for any i/o ops we have already started
1260 		 * to finish before returning.
1261 		 *
1262 		 * XXX we could deal with holes here but it would be
1263 		 * a hassle (in the write case).
1264 		 */
1265 		if (error) {
1266 			s = splbio();
1267 			vnx->vx_error = error;	/* pass error up */
1268 			goto out;
1269 		}
1270 
1271 		/*
1272 		 * compute the size ("sz") of this transfer (in bytes).
1273 		 */
1274 		off = byteoff % sdp->swd_bsize;
1275 		sz = (1 + nra) * sdp->swd_bsize - off;
1276 		if (sz > resid)
1277 			sz = resid;
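
		/*
		 * worked example (hypothetical numbers): with swd_bsize
		 * 8192, byteoff 10000 and nra 0, off = 1808 and
		 * sz = 8192 - 1808 = 6384 (clipped to resid if smaller),
		 * so each pass covers at most the rest of the current
		 * filesystem block run.
		 */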
1278 
1279 		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
1280 			    "vp %p/%p offset 0x%x/0x%x",
1281 			    sdp->swd_vp, vp, byteoff, nbn);
1282 
1283 		/*
1284 		 * now get a buf structure.   note that the vb_buf is
1285 		 * at the front of the nbp structure so that you can
1286 		 * cast pointers between the two structures easily.
1287 		 */
1288 		getvndbuf(nbp);
1289 		BUF_INIT(&nbp->vb_buf);
1290 		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
1291 		nbp->vb_buf.b_bcount   = sz;
1292 		nbp->vb_buf.b_bufsize  = sz;
1293 		nbp->vb_buf.b_error    = 0;
1294 		nbp->vb_buf.b_data     = addr;
1295 		nbp->vb_buf.b_lblkno   = 0;
1296 		nbp->vb_buf.b_blkno    = nbn + btodb(off);
1297 		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
1298 		nbp->vb_buf.b_iodone   = sw_reg_iodone;
1299 		nbp->vb_buf.b_vp       = vp;
1300 		if (vp->v_type == VBLK) {
1301 			nbp->vb_buf.b_dev = vp->v_rdev;
1302 		}
1303 
1304 		nbp->vb_xfer = vnx;	/* patch it back in to vnx */
1305 
1306 		/*
1307 		 * Just sort by block number
1308 		 */
1309 		s = splbio();
1310 		if (vnx->vx_error != 0) {
1311 			putvndbuf(nbp);
1312 			goto out;
1313 		}
1314 		vnx->vx_pending++;
1315 
1316 		/* sort it in and start I/O if we are not over our limit */
1317 		BUFQ_PUT(&sdp->swd_tab, &nbp->vb_buf);
1318 		sw_reg_start(sdp);
1319 		splx(s);
1320 
1321 		/*
1322 		 * advance to the next I/O
1323 		 */
1324 		byteoff += sz;
1325 		addr += sz;
1326 	}
1327 
1328 	s = splbio();
1329 
1330 out: /* Arrive here at splbio */
1331 	vnx->vx_flags &= ~VX_BUSY;
1332 	if (vnx->vx_pending == 0) {
1333 		if (vnx->vx_error != 0) {
1334 			bp->b_error = vnx->vx_error;
1335 			bp->b_flags |= B_ERROR;
1336 		}
1337 		putvndxfer(vnx);
1338 		biodone(bp);
1339 	}
1340 	splx(s);
1341 }
1342 
1343 /*
1344  * sw_reg_start: start an I/O request on the requested swapdev
1345  *
1346  * => reqs are sorted by b_rawblkno (above)
1347  */
1348 static void
1349 sw_reg_start(sdp)
1350 	struct swapdev	*sdp;
1351 {
1352 	struct buf	*bp;
1353 	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);
1354 
1355 	/* recursion control */
1356 	if ((sdp->swd_flags & SWF_BUSY) != 0)
1357 		return;
1358 
1359 	sdp->swd_flags |= SWF_BUSY;
1360 
1361 	while (sdp->swd_active < sdp->swd_maxactive) {
1362 		bp = BUFQ_GET(&sdp->swd_tab);
1363 		if (bp == NULL)
1364 			break;
1365 		sdp->swd_active++;
1366 
1367 		UVMHIST_LOG(pdhist,
1368 		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
1369 		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
1370 		if ((bp->b_flags & B_READ) == 0)
1371 			V_INCR_NUMOUTPUT(bp->b_vp);
1372 
1373 		VOP_STRATEGY(bp);
1374 	}
1375 	sdp->swd_flags &= ~SWF_BUSY;
1376 }
1377 
1378 /*
1379  * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1380  *
1381  * => note that we can recover the vndbuf struct by casting the buf ptr
1382  */
1383 static void
1384 sw_reg_iodone(bp)
1385 	struct buf *bp;
1386 {
1387 	struct vndbuf *vbp = (struct vndbuf *) bp;
1388 	struct vndxfer *vnx = vbp->vb_xfer;
1389 	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
1390 	struct swapdev	*sdp = vnx->vx_sdp;
1391 	int s, resid, error;
1392 	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);
1393 
1394 	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
1395 	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
1396 	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
1397 	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
1398 
1399 	/*
1400 	 * protect vbp at splbio and update.
1401 	 */
1402 
1403 	s = splbio();
1404 	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1405 	pbp->b_resid -= resid;
1406 	vnx->vx_pending--;
1407 
1408 	if (vbp->vb_buf.b_flags & B_ERROR) {
1409 		/* pass error upward */
1410 		error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
1411 		UVMHIST_LOG(pdhist, "  got error=%d !", error, 0, 0, 0);
1412 		vnx->vx_error = error;
1413 	}
1414 
1415 	/*
1416 	 * kill vbp structure
1417 	 */
1418 	putvndbuf(vbp);
1419 
1420 	/*
1421 	 * wrap up this transaction if it has run to completion or, in
1422 	 * case of an error, when all auxiliary buffers have returned.
1423 	 */
1424 	if (vnx->vx_error != 0) {
1425 		/* pass error upward */
1426 		pbp->b_flags |= B_ERROR;
1427 		pbp->b_error = vnx->vx_error;
1428 		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
1429 			putvndxfer(vnx);
1430 			biodone(pbp);
1431 		}
1432 	} else if (pbp->b_resid == 0) {
1433 		KASSERT(vnx->vx_pending == 0);
1434 		if ((vnx->vx_flags & VX_BUSY) == 0) {
1435 			UVMHIST_LOG(pdhist, "  iodone pbp=%p error=%d !",
1436 			    pbp, vnx->vx_error, 0, 0);
1437 			putvndxfer(vnx);
1438 			biodone(pbp);
1439 		}
1440 	}
1441 
1442 	/*
1443 	 * done!   start next swapdev I/O if one is pending
1444 	 */
1445 	sdp->swd_active--;
1446 	sw_reg_start(sdp);
1447 	splx(s);
1448 }
1449 
1450 
1451 /*
1452  * uvm_swap_alloc: allocate space on swap
1453  *
1454  * => allocation is done "round robin" down the priority list, as we
1455  *	allocate in a priority we "rotate" the circle queue.
1456  * => space can be freed with uvm_swap_free
1457  * => we return the page slot number in /dev/drum (0 == invalid slot)
1458  * => we lock uvm.swap_data_lock
1459  * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1460  */
1461 int
1462 uvm_swap_alloc(nslots, lessok)
1463 	int *nslots;	/* IN/OUT */
1464 	boolean_t lessok;
1465 {
1466 	struct swapdev *sdp;
1467 	struct swappri *spp;
1468 	u_long	result;
1469 	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);
1470 
1471 	/*
1472 	 * no swap devices configured yet?   definite failure.
1473 	 */
1474 	if (uvmexp.nswapdev < 1)
1475 		return 0;
1476 
1477 	/*
1478 	 * lock data lock, convert slots into blocks, and enter loop
1479 	 */
1480 	simple_lock(&uvm.swap_data_lock);
1481 
1482 ReTry:	/* XXXMRG */
1483 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
1484 		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1485 			/* if it's not enabled, then we can't swap from it */
1486 			if ((sdp->swd_flags & SWF_ENABLE) == 0)
1487 				continue;
1488 			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1489 				continue;
1490 			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN,
1491 					 EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
1492 					 &result) != 0) {
1493 				continue;
1494 			}
1495 
1496 			/*
1497 			 * successful allocation!  now rotate the circleq.
1498 			 */
1499 			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1500 			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1501 			sdp->swd_npginuse += *nslots;
1502 			uvmexp.swpginuse += *nslots;
1503 			simple_unlock(&uvm.swap_data_lock);
1504 			/* done!  return drum slot number */
1505 			UVMHIST_LOG(pdhist,
1506 			    "success!  returning %d slots starting at %d",
1507 			    *nslots, result + sdp->swd_drumoffset, 0, 0);
1508 			return (result + sdp->swd_drumoffset);
1509 		}
1510 	}
1511 
1512 	/* XXXMRG: BEGIN HACK */
1513 	if (*nslots > 1 && lessok) {
1514 		*nslots = 1;
1515 		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
1516 	}
1517 	/* XXXMRG: END HACK */
1518 
1519 	simple_unlock(&uvm.swap_data_lock);
1520 	return 0;
1521 }
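
/*
 * usage sketch (hypothetical caller, mirroring what the pageout path
 * does): request a contiguous run, falling back to single slots:
 *
 *	int nslots = npages;
 *	int slot = uvm_swap_alloc(&nslots, TRUE);	(lessok = TRUE)
 *	if (slot == 0)
 *		...no swap space available, defer...
 *	...on success nslots may have been reduced to 1...
 */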
1522 
1523 /*
1524  * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1525  *
1526  * => we lock uvm.swap_data_lock
1527  */
1528 void
1529 uvm_swap_markbad(startslot, nslots)
1530 	int startslot;
1531 	int nslots;
1532 {
1533 	struct swapdev *sdp;
1534 	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);
1535 
1536 	simple_lock(&uvm.swap_data_lock);
1537 	sdp = swapdrum_getsdp(startslot);
1538 
1539 	/*
1540 	 * we just keep track of how many pages have been marked bad
1541 	 * in this device, to make everything add up in swap_off().
1542 	 * we assume here that the range of slots will all be within
1543 	 * one swap device.
1544 	 */
1545 
1546 	sdp->swd_npgbad += nslots;
1547 	UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
1548 	simple_unlock(&uvm.swap_data_lock);
1549 }
1550 
1551 /*
1552  * uvm_swap_free: free swap slots
1553  *
1554  * => this can be all or part of an allocation made by uvm_swap_alloc
1555  * => we lock uvm.swap_data_lock
1556  */
1557 void
1558 uvm_swap_free(startslot, nslots)
1559 	int startslot;
1560 	int nslots;
1561 {
1562 	struct swapdev *sdp;
1563 	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);
1564 
1565 	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
1566 	    startslot, 0, 0);
1567 
1568 	/*
1569 	 * ignore attempts to free the "bad" slot.
1570 	 */
1571 
1572 	if (startslot == SWSLOT_BAD) {
1573 		return;
1574 	}
1575 
1576 	/*
1577 	 * convert drum slot offset back to sdp, free the blocks
1578 	 * in the extent, and return.   must hold pri lock to do
1579 	 * lookup and access the extent.
1580 	 */
1581 
1582 	simple_lock(&uvm.swap_data_lock);
1583 	sdp = swapdrum_getsdp(startslot);
1584 	KASSERT(uvmexp.nswapdev >= 1);
1585 	KASSERT(sdp != NULL);
1586 	KASSERT(sdp->swd_npginuse >= nslots);
1587 	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
1588 			EX_MALLOCOK|EX_NOWAIT) != 0) {
1589 		printf("warning: resource shortage: %d pages of swap lost\n",
1590 			nslots);
1591 	}
1592 	sdp->swd_npginuse -= nslots;
1593 	uvmexp.swpginuse -= nslots;
1594 	simple_unlock(&uvm.swap_data_lock);
1595 }
1596 
1597 /*
1598  * uvm_swap_put: put any number of pages into a contig place on swap
1599  *
1600  * => can be sync or async
1601  */
1602 
1603 int
1604 uvm_swap_put(swslot, ppsp, npages, flags)
1605 	int swslot;
1606 	struct vm_page **ppsp;
1607 	int npages;
1608 	int flags;
1609 {
1610 	int error;
1611 
1612 	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
1613 	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1614 	return error;
1615 }
1616 
1617 /*
1618  * uvm_swap_get: get a single page from swap
1619  *
1620  * => usually a sync op (from fault)
1621  */
1622 
1623 int
1624 uvm_swap_get(page, swslot, flags)
1625 	struct vm_page *page;
1626 	int swslot, flags;
1627 {
1628 	int error;
1629 
1630 	uvmexp.nswget++;
1631 	KASSERT(flags & PGO_SYNCIO);
1632 	if (swslot == SWSLOT_BAD) {
1633 		return EIO;
1634 	}
1635 	error = uvm_swap_io(&page, swslot, 1, B_READ |
1636 	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1637 	if (error == 0) {
1638 
1639 		/*
1640 		 * this page is no longer only in swap.
1641 		 */
1642 
1643 		simple_lock(&uvm.swap_data_lock);
1644 		KASSERT(uvmexp.swpgonly > 0);
1645 		uvmexp.swpgonly--;
1646 		simple_unlock(&uvm.swap_data_lock);
1647 	}
1648 	return error;
1649 }
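
/*
 * lifecycle sketch tying the allocator and i/o entry points together
 * (illustrative only, error handling elided):
 *
 *	slot = uvm_swap_alloc(&n, FALSE);		(reserve drum pages)
 *	error = uvm_swap_put(slot, pps, n, PGO_SYNCIO);	(page out)
 *	...
 *	error = uvm_swap_get(pg, slot, PGO_SYNCIO);	(page back in)
 *	uvm_swap_free(slot, n);				(release the slots)
 *
 * a caller that hit an i/o error would uvm_swap_markbad() the range
 * instead of freeing it.
 */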
1650 
1651 /*
1652  * uvm_swap_io: do an i/o operation to swap
1653  */
1654 
1655 static int
1656 uvm_swap_io(pps, startslot, npages, flags)
1657 	struct vm_page **pps;
1658 	int startslot, npages, flags;
1659 {
1660 	daddr_t startblk;
1661 	struct	buf *bp;
1662 	vaddr_t kva;
1663 	int	error, s, mapinflags;
1664 	boolean_t write, async;
1665 	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);
1666 
1667 	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
1668 	    startslot, npages, flags, 0);
1669 
1670 	write = (flags & B_READ) == 0;
1671 	async = (flags & B_ASYNC) != 0;
1672 
1673 	/*
1674 	 * convert starting drum slot to block number
1675 	 */
1676 
1677 	startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);
1678 
1679 	/*
1680 	 * first, map the pages into the kernel.
1681 	 */
1682 
1683 	mapinflags = !write ?
1684 		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
1685 		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
1686 	kva = uvm_pagermapin(pps, npages, mapinflags);
1687 
1688 	/*
1689 	 * now allocate a buf for the i/o.
1690 	 */
1691 
1692 	s = splbio();
1693 	bp = pool_get(&bufpool, PR_WAITOK);
1694 	splx(s);
1695 
1696 	/*
1697 	 * fill in the bp/sbp.   we currently route our i/o through
1698 	 * /dev/drum's vnode [swapdev_vp].
1699 	 */
1700 
1701 	BUF_INIT(bp);
1702 	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
1703 	bp->b_proc = &proc0;	/* XXX */
1704 	bp->b_vnbufs.le_next = NOLIST;
1705 	bp->b_data = (caddr_t)kva;
1706 	bp->b_blkno = startblk;
1707 	bp->b_vp = swapdev_vp;
1708 	bp->b_dev = swapdev_vp->v_rdev;
1709 	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
1710 
1711 	/*
1712 	 * bump v_numoutput (counter of number of active outputs).
1713 	 */
1714 
1715 	if (write) {
1716 		s = splbio();
1717 		V_INCR_NUMOUTPUT(swapdev_vp);
1718 		splx(s);
1719 	}
1720 
1721 	/*
1722 	 * for async ops we must set up the iodone handler.
1723 	 */
1724 
1725 	if (async) {
1726 		bp->b_flags |= B_CALL;
1727 		bp->b_iodone = uvm_aio_biodone;
1728 		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
1729 	}
1730 	UVMHIST_LOG(pdhist,
1731 	    "about to start io: data = %p blkno = 0x%x, bcount = %ld",
1732 	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);
1733 
1734 	/*
1735 	 * now we start the I/O, and if async, return.
1736 	 */
1737 
1738 	VOP_STRATEGY(bp);
1739 	if (async)
1740 		return 0;
1741 
1742 	/*
1743 	 * must be sync i/o.   wait for it to finish
1744 	 */
1745 
1746 	error = biowait(bp);
1747 
1748 	/*
1749 	 * kill the pager mapping
1750 	 */
1751 
1752 	uvm_pagermapout(kva, npages);
1753 
1754 	/*
1755 	 * now dispose of the buf and we're done.
1756 	 */
1757 
1758 	s = splbio();
1759 	if (write)
1760 		vwakeup(bp);
1761 	pool_put(&bufpool, bp);
1762 	splx(s);
1763 	UVMHIST_LOG(pdhist, "<- done (sync)  error=%d", error, 0, 0, 0);
1764 	return (error);
1765 }
1766