/*	$OpenBSD: uvm_swap.c,v 1.87 2009/03/23 22:07:41 oga Exp $	*/
/*	$NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>

#include <uvm/uvm.h>
#ifdef UVM_SWAP_ENCRYPT
#include <dev/rndvar.h>
#include <sys/syslog.h>
#endif

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.   there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.   this LIST is headed
 * by the "swap_priority" global var.    each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap extent.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * sys_swapctl() performs the following operations (sketched in the
 * example below):
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
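
/*
 * Illustrative sketch (not part of this file): roughly how a userland
 * program would drive the operations above via swapctl(2).  Error
 * handling is abbreviated and the device path is made up.
 */
#if 0
#include <sys/types.h>
#include <sys/swap.h>
#include <err.h>
#include <stdlib.h>

void
swap_example(void)
{
	struct swapent *sep;
	int nswap;

	/* [3] SWAP_ON: enable swapping on a partition at priority 0 */
	if (swapctl(SWAP_ON, "/dev/sd0b", 0) == -1)
		err(1, "SWAP_ON");

	/* [1] SWAP_NSWAP: number of configured devices */
	if ((nswap = swapctl(SWAP_NSWAP, NULL, 0)) == -1)
		err(1, "SWAP_NSWAP");

	/* [2] SWAP_STATS: one swapent per configured device */
	if ((sep = calloc(nswap, sizeof(*sep))) == NULL)
		err(1, "calloc");
	if (swapctl(SWAP_STATS, sep, nswap) == -1)
		err(1, "SWAP_STATS");
	free(sep);
}
#endif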

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct swapent	swd_se;
#define	swd_dev		swd_se.se_dev		/* device id */
#define	swd_flags	swd_se.se_flags		/* flags:inuse/enable/fake */
#define	swd_priority	swd_se.se_priority	/* our priority */
#define	swd_inuse	swd_se.se_inuse		/* blocks in use */
#define	swd_nblks	swd_se.se_nblks		/* total blocks */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	char			swd_exname[12];	/* name of extent above */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct buf		swd_tab;	/* buffer list */
	struct ucred		*swd_cred;	/* cred for file access */
#ifdef UVM_SWAP_ENCRYPT
#define SWD_KEY_SHIFT		7		/* One key per 0.5 MByte */
#define SWD_KEY(x,y)		&((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT])
#define	SWD_KEY_SIZE(x)	(((x) + (1 << SWD_KEY_SHIFT) - 1) >> SWD_KEY_SHIFT)

#define SWD_DCRYPT_SHIFT	5
#define SWD_DCRYPT_BITS		32
#define SWD_DCRYPT_MASK		(SWD_DCRYPT_BITS - 1)
#define SWD_DCRYPT_OFF(x)	((x) >> SWD_DCRYPT_SHIFT)
#define SWD_DCRYPT_BIT(x)	((x) & SWD_DCRYPT_MASK)
#define SWD_DCRYPT_SIZE(x)	(SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK) * sizeof(u_int32_t))
	u_int32_t		*swd_decrypt;	/* bitmap for decryption */
	struct swap_key		*swd_keys;	/* keys for different parts */
#endif
};
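
/*
 * Worked example of the encryption bookkeeping above (a sketch assuming
 * 4KB pages): SWD_KEY_SHIFT of 7 means one key covers 2^7 = 128 pages =
 * 512KB, hence "one key per 0.5 MByte".  For a 256MB swap area,
 * npages = 65536, so
 *
 *	SWD_KEY_SIZE(65536)    = (65536 + 127) >> 7      = 512 keys
 *	SWD_DCRYPT_SIZE(65536) = ((65536 + 31) >> 5) * 4 = 8192 bytes
 *
 * i.e. 512 swap_key structures plus an 8KB decrypt bitmap (one bit per
 * page), which matches the comment in uvm_swap_initcrypt() below.
 */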

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;     /* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * We keep a pool of vndbuf structures and a pool of vndxfer structures.
 */
struct pool vndxfer_pool;
struct pool vndbuf_pool;

#define	getvndxfer(vnx)	do {						\
	int s = splbio();						\
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);			\
	splx(s);							\
} while (0)

#define putvndxfer(vnx) {						\
	pool_put(&vndxfer_pool, (void *)(vnx));				\
}

#define	getvndbuf(vbp)	do {						\
	int s = splbio();						\
	vbp = pool_get(&vndbuf_pool, PR_WAITOK);			\
	splx(s);							\
} while (0)

#define putvndbuf(vbp) {						\
	pool_put(&vndbuf_pool, (void *)(vbp));				\
}
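
/*
 * Typical use of the wrappers above (an illustrative fragment, not a
 * real call site): pool_get() may sleep (PR_WAITOK), and the splbio()
 * bracket is needed because the pools are also touched from the i/o
 * completion path, which runs at IPL_BIO (see sw_reg_iodone()).
 */
#if 0
	struct vndxfer *vnx;

	getvndxfer(vnx);	/* may sleep; spl is restored on return */
	/* ... set up and issue the transfer ... */
	putvndxfer(vnx);	/* called at splbio on completion */
#endif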

/*
 * local variables
 */
static struct extent *swapmap;		/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");

/*
 * prototypes
 */
static void		 swapdrum_add(struct swapdev *, int);
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, int);
static void		 swaplist_insert(struct swapdev *,
					     struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct proc *, struct swapdev *);
static int swap_off(struct proc *, struct swapdev *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_iodone(struct buf *);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

static void swapmount(void);

#ifdef UVM_SWAP_ENCRYPT
/* for swap encrypt */
boolean_t uvm_swap_allocpages(struct vm_page **, int);
void uvm_swap_markdecrypt(struct swapdev *, int, int, int);
boolean_t uvm_swap_needdecrypt(struct swapdev *, int);
void uvm_swap_initcrypt(struct swapdev *, int);
#endif

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	simple_lock_init(&uvm.swap_data_lock);

	if (!swapdev_vp && bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.   the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
				M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * allocate pools for structures used for swapping to files.
	 */
	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
	    NULL);

	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
	    NULL);

	/*
	 * Setup the initial swap partition
	 */
	swapmount();

	/*
	 * done!
	 */
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

#ifdef UVM_SWAP_ENCRYPT
void
uvm_swap_initcrypt_all(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	int npages;

	simple_lock(&uvm.swap_data_lock);

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)
			if (sdp->swd_decrypt == NULL) {
				npages = dbtob((uint64_t)sdp->swd_nblks) >>
				    PAGE_SHIFT;
				uvm_swap_initcrypt(sdp, npages);
			}
	}
	simple_unlock(&uvm.swap_data_lock);
}

void
uvm_swap_initcrypt(struct swapdev *sdp, int npages)
{
	/*
	 * keep track of whether a page needs to be decrypted when we get
	 * it from the swap device.
	 * We cannot risk a malloc later: if we are doing ASYNC puts,
	 * we may not call malloc with M_WAITOK.  This consumes only
	 * 8KB memory for a 256MB swap partition.
	 */
	sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages), M_VMSWAP,
	    M_WAITOK|M_ZERO);
	sdp->swd_keys = malloc(SWD_KEY_SIZE(npages) * sizeof(struct swap_key),
	    M_VMSWAP, M_WAITOK|M_ZERO);
}

boolean_t
uvm_swap_allocpages(struct vm_page **pps, int npages)
{
	int i;
	boolean_t fail;

	/* Estimate if we will succeed */
	uvm_lock_fpageq();

	fail = uvmexp.free - npages < uvmexp.reserve_kernel;

	uvm_unlock_fpageq();

	if (fail)
		return FALSE;

	/* Get new pages */
	for (i = 0; i < npages; i++) {
		pps[i] = uvm_pagealloc(NULL, 0, NULL, 0);
		if (pps[i] == NULL)
			break;
	}

	/* On failure free and return */
	if (i < npages) {
		uvm_swap_freepages(pps, i);
		return FALSE;
	}

	return TRUE;
}

void
uvm_swap_freepages(struct vm_page **pps, int npages)
{
	int i;

	uvm_lock_pageq();
	for (i = 0; i < npages; i++)
		uvm_pagefree(pps[i]);
	uvm_unlock_pageq();
}

/*
 * Mark pages on the swap device for later decryption
 */

void
uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages,
    int decrypt)
{
	int pagestart, i;
	int off, bit;

	if (!sdp)
		return;

	pagestart = startslot - sdp->swd_drumoffset;
	for (i = 0; i < npages; i++, pagestart++) {
		off = SWD_DCRYPT_OFF(pagestart);
		bit = SWD_DCRYPT_BIT(pagestart);
		if (decrypt)
			/* pages read need decryption */
			sdp->swd_decrypt[off] |= 1 << bit;
		else
			/* pages read do not need decryption */
			sdp->swd_decrypt[off] &= ~(1 << bit);
	}
}

/*
 * Check if the page that we got from disk needs to be decrypted
 */

boolean_t
uvm_swap_needdecrypt(struct swapdev *sdp, int off)
{
	if (!sdp)
		return FALSE;

	off -= sdp->swd_drumoffset;
	return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)] & (1 << SWD_DCRYPT_BIT(off)) ?
		TRUE : FALSE;
}

void
uvm_swap_finicrypt_all(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct swap_key *key;
	unsigned int nkeys;

	simple_lock(&uvm.swap_data_lock);

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_decrypt == NULL)
				continue;

			nkeys = dbtob((uint64_t)sdp->swd_nblks) >> PAGE_SHIFT;
			key = sdp->swd_keys + (SWD_KEY_SIZE(nkeys) - 1);
			do {
				if (key->refcount != 0)
					swap_key_delete(key);
			} while (key-- != sdp->swd_keys);
		}
	}
	simple_unlock(&uvm.swap_data_lock);
}
#endif /* UVM_SWAP_ENCRYPT */

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc from
 *	blocking here while adding swap)
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;  /* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %ld",
			    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		free(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).   now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}
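
/*
 * Ordering sketch for swaplist_insert() (illustrative): inserting
 * devices at priorities 10, 0 and 10 again yields the sorted list
 * {0: [devB]} -> {10: [devA, devC]}.  Same-priority devices are
 * appended at the circleq tail, and the second insert at 10 frees
 * the caller-supplied swappri because that bucket already exists.
 */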

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(struct vnode *vp, boolean_t remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
	}
	return (NULL);
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
 */
static void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
		nextspp = LIST_NEXT(spp, spi_swappri);
		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
		    (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP);
	}
}

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => uvm.swap_data_lock should be unlocked (we may sleep)
 */
static void
swapdrum_add(struct swapdev *sdp, int npages)
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm.swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri))
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
	return NULL;
}
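
/*
 * Drum mapping sketch (illustrative numbers): if device A was handed
 * drum pages [1, 1000] (drumoffset 1, drumsize 1000) and device B got
 * [1001, 5000], then drum page 1500 falls in B, and its page number on
 * that device is 1500 - 1001 = 499.  swstrategy() below finishes the
 * job by converting that page number to a disk block with
 * btodb(499 << PAGE_SHIFT).
 */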

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 * 	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[MAXPATHLEN];
	size_t	len;
	int	count, error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter_write(&swap_syscall_lock);

	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%ld", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm.swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		for (spp = LIST_FIRST(&swap_priority); spp != NULL;
		    spp = LIST_NEXT(spp, spi_swappri)) {
			for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
			     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
				sdp->swd_inuse =
				    btodb((u_int64_t)sdp->swd_npginuse <<
				    PAGE_SHIFT);
				error = copyout(&sdp->swd_se, sep,
				    sizeof(struct swapent));

				/* now copy out the path if necessary */
				if (error == 0)
					error = copyout(sdp->swd_path,
					    &sep->se_path, sdp->swd_pathlen);

				if (error)
					goto out;
				count++;
				sep++;
			}
		}

		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);

		*retval = count;
		error = 0;
		goto out;
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = suser(p, 0)))
		goto out;

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE, p)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, sizeof userpath, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    sizeof userpath, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch (SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		dumpdev = vp->v_rdev;
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		simple_unlock(&uvm.swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			simple_unlock(&uvm.swap_data_lock);
			break;
		}
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK|M_ZERO);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;

		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG) {
			sdp->swd_cred = crdup(p->p_ucred);
		}

		swaplist_insert(sdp, spp, priority);
		simple_unlock(&uvm.swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(p, sdp)) != 0) {
			simple_lock(&uvm.swap_data_lock);
			(void) swaplist_find(vp, 1);  /* kill fake entry */
			swaplist_trim();
			simple_unlock(&uvm.swap_data_lock);
			if (vp->v_type == VREG) {
				crfree(sdp->swd_cred);
			}
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}
		break;

	case SWAP_OFF:
		simple_lock(&uvm.swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			simple_unlock(&uvm.swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			simple_unlock(&uvm.swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(p, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);

out:
	rw_exit_write(&swap_syscall_lock);

	UVMHIST_LOG(pdhist, "<- done!  error=%ld", error, 0, 0, 0);
	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.   note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm.swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(struct proc *p, struct swapdev *sdp)
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	struct vattr va;
#if defined(NFSCLIENT)
	extern int (**nfsv2_vnodeop_p)(void *);
#endif /* defined(NFSCLIENT) */
	dev_t dev;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%ld, major(dev)=%ld", dev, major(dev), 0,0);

	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblocks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
#if defined(NFSCLIENT)
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* defined(NFSCLIENT) */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%lx: size=%ld addr=0x%lx\n",
	    dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	snprintf(sdp->swd_exname, sizeof(sdp->swd_exname), "swap0x%04x",
	    count++);

	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(sdp->swd_exname, 0, npages - 1, M_VMSWAP,
				    0, 0, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel region");
	}

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
		rootpages = round_page(dbtob((u_int64_t)rootblocks))
		    >> PAGE_SHIFT;
		if (rootpages >= size)
			panic("swap_on: miniroot larger than swap?");

		if (extent_alloc_region(sdp->swd_ex, addr,
					rootpages, EX_WAITOK))
			panic("swap_on: unable to preserve miniroot");

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

#ifdef UVM_SWAP_ENCRYPT
	if (uvm_doswapencrypt)
		uvm_swap_initcrypt(sdp, npages);
#endif
	/*
	 * now add the new swapdev to the drum and enable.
	 */
	simple_lock(&uvm.swap_data_lock);
	swapdrum_add(sdp, npages);
	sdp->swd_npages = size;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	simple_unlock(&uvm.swap_data_lock);
	return (0);

bad:
	/*
	 * failure: close device if necessary and return error.
	 */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}
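
/*
 * Sizing sketch for swap_on() (illustrative, assuming 512-byte disk
 * blocks and 4KB pages): a 1GB block device reports nblocks = 2097152,
 * so npages = dbtob(2097152) >> PAGE_SHIFT = 262144.  Because VBLK
 * devices keep page 0 out of the extent to protect the disklabel, the
 * usable area is size = 262143 pages starting at addr 1.
 */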

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(struct proc *p, struct swapdev *sdp)
{
	int error = 0;
	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%lx", sdp->swd_dev,0,0,0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	simple_unlock(&uvm.swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
			  sdp->swd_drumoffset + sdp->swd_drumsize)) {

		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		simple_lock(&uvm.swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		simple_unlock(&uvm.swap_data_lock);
		return (error);
	}

	/*
	 * done with the vnode and saved creds.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	if (sdp->swd_vp->v_type == VREG) {
		crfree(sdp->swd_cred);
	}
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	}

	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpages -= sdp->swd_npages;

	if (swaplist_find(sdp->swd_vp, 1) == NULL)
		panic("swap_off: swapdev not in list");
	swaplist_trim();

	/*
	 * free all resources!
	 */
	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
		    EX_WAITOK);
	extent_destroy(sdp->swd_ex);
	free(sdp, M_VMSWAP);
	simple_unlock(&uvm.swap_data_lock);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	int s, pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((u_int64_t)bp->b_blkno) >> PAGE_SHIFT;
	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	simple_unlock(&uvm.swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	UVMHIST_LOG(pdhist, "  %s: mapoff=%lx bn=0x%lx bcount=%ld",
		((bp->b_flags & B_READ) == 0) ? "write" : "read",
		sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		buf_replacevnode(bp, sdp->swd_vp);

		bp->b_blkno = bn;
		splx(s);
		VOP_STRATEGY(bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr64_t	nbn;
	caddr_t		addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	getvndxfer(vnx);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((u_int64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
				 &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr64_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
			    "vp %p/%p offset 0x%lx/0x%llx",
			    sdp->swd_vp, vp, (u_long)byteoff, nbn);

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		getvndbuf(nbp);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_proc     = bp->b_proc;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = NULLVP;
		nbp->vb_buf.b_vnbufs.le_next = NOLIST;
		LIST_INIT(&nbp->vb_buf.b_dep);

		/*
		 * set b_dirtyoff/end and b_validoff/end.   this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* sort it in and start I/O if we are not over our limit */
		disksort(&sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}
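
/*
 * Transfer-size sketch for the loop above (illustrative numbers): with
 * swd_bsize = 64KB, byteoff = 80KB and nra = 1 back from VOP_BMAP,
 * off = 80KB % 64KB = 16KB and sz = (1 + 1) * 64KB - 16KB = 112KB,
 * clipped to the remaining resid.  The child buffer then starts at
 * block nbn + btodb(16KB) on the underlying vnode.
 */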

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by disksort (above)
 */
static void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_tab.b_active < sdp->swd_maxactive) {
		bp = sdp->swd_tab.b_actf;
		if (bp == NULL)
			break;
		sdp->swd_tab.b_actf = bp->b_actf;
		sdp->swd_tab.b_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno 0x%lx cnt 0x%lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;

		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(struct buf *bp)
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int resid;
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=0x%lx addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	splassert(IPL_BIO);

	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error) {
		UVMHIST_LOG(pdhist, "  got error=%ld !",
		    vbp->vb_buf.b_error, 0, 0, 0);

		/* pass error upward */
		vnx->vx_error = vbp->vb_buf.b_error;
	}

	/*
	 * disassociate this buffer from the vnode (if any).
	 */
	if (vbp->vb_buf.b_vp != NULL) {
		brelvp(&vbp->vb_buf);
	}

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone error=%ld !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_tab.b_active--;
	sw_reg_start(sdp);
}

/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm.swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots, boolean_t lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long	result;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?   definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	simple_lock(&uvm.swap_data_lock);

ReTry:	/* XXXMRG */
	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
	     spp = LIST_NEXT(spp, spi_swappri)) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN, 0,
					 EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
					 &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			simple_unlock(&uvm.swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %ld slots starting at %ld",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return(result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
	}
	/* XXXMRG: END HACK */

	simple_unlock(&uvm.swap_data_lock);
	return 0;		/* failed */
}
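
/*
 * Round-robin sketch for uvm_swap_alloc() (illustrative): with two
 * enabled devices A and B in the same priority bucket, the first
 * allocation is satisfied by A and rotates the circleq to [B, A], so
 * the next allocation comes from B, and so on.  Buckets with a smaller
 * spi_priority value are always tried first; a bucket with a larger
 * value only sees allocations once every smaller-valued bucket fails.
 */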

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	if (sdp != NULL) {
		/*
		 * we just keep track of how many pages have been marked bad
		 * in this device, to make everything add up in swap_off().
		 * we assume here that the range of slots will all be within
		 * one swap device.
		 */
		sdp->swd_npgbad += nslots;
		UVMHIST_LOG(pdhist, "now %ld bad", sdp->swd_npgbad, 0,0,0);
	}
	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm.swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %ld slots starting at %ld", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.   must hold pri lock to do
	 * lookup and access the extent.
	 */

	simple_lock(&uvm.swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
			EX_MALLOCOK|EX_NOWAIT) != 0) {
		printf("warning: resource shortage: %d pages of swap lost\n",
			nslots);
	}

	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef UVM_SWAP_ENCRYPT
	{
		int i;
		if (swap_encrypt_initialized) {
			/* Dereference keys */
			for (i = 0; i < nslots; i++)
				if (uvm_swap_needdecrypt(sdp, startslot + i)) {
					struct swap_key *key;

					key = SWD_KEY(sdp, startslot + i);
					if (key->refcount != 0)
						SWAP_KEY_PUT(sdp, key);
				}

			/* Mark range as not decrypt */
			uvm_swap_markdecrypt(sdp, startslot, nslots, 0);
		}
	}
#endif /* UVM_SWAP_ENCRYPT */
	simple_unlock(&uvm.swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int	result;

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int	result;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return VM_PAGER_ERROR;
	}

	/*
	 * this page is (about to be) no longer only in swap.
	 */
	simple_lock(&uvm.swap_data_lock);
	uvmexp.swpgonly--;
	simple_unlock(&uvm.swap_data_lock);

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	if (result != VM_PAGER_OK && result != VM_PAGER_PEND) {
		/*
		 * oops, the read failed so it really is still only in swap.
		 */
		simple_lock(&uvm.swap_data_lock);
		uvmexp.swpgonly++;
		simple_unlock(&uvm.swap_data_lock);
	}

	return (result);
}
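
/*
 * Caller-side sketch (illustrative, not a real call site): synchronous
 * operations return VM_PAGER_OK or VM_PAGER_ERROR, while an async
 * uvm_swap_put() returns VM_PAGER_PEND and completion is reported later
 * through the buf's b_iodone hook (uvm_aio_biodone, set in uvm_swap_io()
 * below).
 */
#if 0
	if (uvm_swap_put(swslot, pps, npages, PGO_SYNCIO) != VM_PAGER_OK)
		uvm_swap_markbad(swslot, npages);	/* hypothetical recovery */
#endif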

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr64_t startblk;
	struct	buf *bp;
	vaddr_t kva;
	int	result, s, mapinflags, pflag;
	boolean_t write, async;
#ifdef UVM_SWAP_ENCRYPT
	vaddr_t dstkva;
	struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
	struct swapdev *sdp;
	int	encrypt = 0;
#endif
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%ld, npages=%ld, flags=%ld",
	    startslot, npages, flags, 0);

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/*
	 * convert starting drum slot to block number
	 */
	startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).
	 */
	mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
	if (!async)
		mapinflags |= UVMPAGER_MAPIN_WAITOK;
	kva = uvm_pagermapin(pps, npages, mapinflags);
	if (kva == 0)
		return (VM_PAGER_AGAIN);

#ifdef UVM_SWAP_ENCRYPT
	if (write) {
		/*
		 * Check if we need to do swap encryption on old pages.
		 * Later we need a different scheme, that swap encrypts
		 * all pages of a process that had at least one page swap
		 * encrypted.  Then we might not need to copy all pages
		 * in the cluster, and avoid the memory overhead in
		 * swapping.
		 */
		if (uvm_doswapencrypt)
			encrypt = 1;
	}

	if (swap_encrypt_initialized || encrypt) {
		/*
		 * we need to know the swap device that we are swapping to/from
		 * to see if the pages need to be marked for decryption or
		 * actually need to be decrypted.
		 * XXX - does this information stay the same over the whole
		 * execution of this function?
		 */
		simple_lock(&uvm.swap_data_lock);
		sdp = swapdrum_getsdp(startslot);
		simple_unlock(&uvm.swap_data_lock);
	}

	/*
	 * encrypt to swap
	 */
	if (write && encrypt) {
		int i, opages;
		caddr_t src, dst;
		struct swap_key *key;
		u_int64_t block;
		int swmapflags;

		/* We always need write access. */
		swmapflags = UVMPAGER_MAPIN_READ;
		if (!async)
			swmapflags |= UVMPAGER_MAPIN_WAITOK;

		if (!uvm_swap_allocpages(tpps, npages)) {
			uvm_pagermapout(kva, npages);
			return (VM_PAGER_AGAIN);
		}

		dstkva = uvm_pagermapin(tpps, npages, swmapflags);
		if (dstkva == 0) {
			uvm_pagermapout(kva, npages);
			uvm_swap_freepages(tpps, npages);
			return (VM_PAGER_AGAIN);
		}

		src = (caddr_t) kva;
		dst = (caddr_t) dstkva;
		block = startblk;
		for (i = 0; i < npages; i++) {
			key = SWD_KEY(sdp, startslot + i);
			SWAP_KEY_GET(sdp, key);	/* add reference */

			/* mark for async writes */
			atomic_setbits_int(&tpps[i]->pg_flags, PQ_ENCRYPT);
			swap_encrypt(key, src, dst, block, 1 << PAGE_SHIFT);
			src += 1 << PAGE_SHIFT;
			dst += 1 << PAGE_SHIFT;
			block += btodb(1 << PAGE_SHIFT);
		}

		uvm_pagermapout(kva, npages);

		/* dispose of pages we don't use anymore */
		opages = npages;
		uvm_pager_dropcluster(NULL, NULL, pps, &opages,
				      PGO_PDFREECLUST);

		kva = dstkva;
	}
#endif /* UVM_SWAP_ENCRYPT */

	/*
	 * now allocate a buf for the i/o.
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	pflag = (async || curproc == uvm.pagedaemon_proc) ? 0 : PR_WAITOK;
	bp = pool_get(&bufpool, pflag);
	splx(s);

	/*
	 * if we failed to get a swapbuf, return "try again"
	 */
	if (bp == NULL) {
#ifdef UVM_SWAP_ENCRYPT
		if (write && encrypt) {
			int i;

			/* swap encrypt needs cleanup */
			for (i = 0; i < npages; i++)
				SWAP_KEY_PUT(sdp, SWD_KEY(sdp, startslot + i));

			uvm_pagermapout(kva, npages);
			uvm_swap_freepages(tpps, npages);
		}
#endif
		return (VM_PAGER_AGAIN);
	}

#ifdef UVM_SWAP_ENCRYPT
	/*
	 * prevent ASYNC reads.
	 * uvm_swap_io is only called from uvm_swap_get, uvm_swap_get
	 * assumes that all gets are SYNCIO.  Just make sure here.
	 * XXXARTUBC - might not be true anymore.
	 */
	if (!write) {
		flags &= ~B_ASYNC;
		async = 0;
	}
#endif
	/*
	 * fill in the bp.   we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp->b_flags = B_BUSY | B_NOCACHE | B_RAW | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = startblk;
	LIST_INIT(&bp->b_dep);
	s = splbio();
	bp->b_vp = NULL;
	buf_replacevnode(bp, swapdev_vp);
	splx(s);
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * for pageouts we must set "dirtyoff" [NFS client code needs it].
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if (write) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages << PAGE_SHIFT;
#ifdef UVM_SWAP_ENCRYPT
		/* mark the pages in the drum for decryption */
		if (swap_encrypt_initialized)
			uvm_swap_markdecrypt(sdp, startslot, npages, encrypt);
#endif
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */
	if (async) {
		bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
					 B_PDAEMON : 0);
		bp->b_iodone = uvm_aio_biodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %p blkno = 0x%lx, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */
	VOP_STRATEGY(bp);
	if (async)
		return (VM_PAGER_PEND);

	/*
	 * must be sync i/o.   wait for it to finish
	 */
	(void) biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

#ifdef UVM_SWAP_ENCRYPT
	/*
	 * decrypt swap
	 */
	if (swap_encrypt_initialized &&
	    (bp->b_flags & B_READ) && !(bp->b_flags & B_ERROR)) {
		int i;
		caddr_t data = bp->b_data;
		u_int64_t block = startblk;
		struct swap_key *key;

		for (i = 0; i < npages; i++) {
			/* Check if we need to decrypt */
			if (uvm_swap_needdecrypt(sdp, startslot + i)) {
				key = SWD_KEY(sdp, startslot + i);
				if (key->refcount == 0) {
					result = VM_PAGER_ERROR;
					break;
				}
				swap_decrypt(key, data, data, block,
					     1 << PAGE_SHIFT);
			}
			data += 1 << PAGE_SHIFT;
			block += btodb(1 << PAGE_SHIFT);
		}
	}
#endif
	/*
	 * kill the pager mapping
	 */
	uvm_pagermapout(kva, npages);

#ifdef UVM_SWAP_ENCRYPT
	/*
	 * No longer needed; free the temporary pages used for encryption.
	 */
	if ((bp->b_flags & B_READ) == 0 && encrypt)
		uvm_swap_freepages(tpps, npages);
#endif
	/*
	 * now dispose of the buf
	 */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);

	if (write && bp->b_vp)
		vwakeup(bp->b_vp);
	pool_put(&bufpool, bp);
	splx(s);

	/*
	 * finally return.
	 */
	UVMHIST_LOG(pdhist, "<- done (sync)  result=%ld", result, 0, 0, 0);
	return (result);
}

static void
swapmount(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct vnode *vp;
	dev_t swap_dev = swdevt[0].sw_dev;

	/*
	 * No locking here since we happen to know that we will just be called
	 * once before any other process has forked.
	 */

	if (swap_dev == NODEV) {
		printf("swapmount: no device\n");
		return;
	}

	if (bdevvp(swap_dev, &vp)) {
		printf("swapmount: no device 2\n");
		return;
	}

	sdp = malloc(sizeof(*sdp), M_VMSWAP, M_WAITOK|M_ZERO);
	spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK);

	sdp->swd_flags = SWF_FAKE;
	sdp->swd_dev = swap_dev;
	sdp->swd_vp = vp;
	swaplist_insert(sdp, spp, 0);
	sdp->swd_pathlen = strlen("swap_device") + 1;
	sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
	if (copystr("swap_device", sdp->swd_path, sdp->swd_pathlen, 0))
		panic("swapmount: copystr");

	if (swap_on(curproc, sdp)) {
		swaplist_find(vp, 1);
		swaplist_trim();
		vput(sdp->swd_vp);
		free(sdp->swd_path, M_VMSWAP);
		free(sdp, M_VMSWAP);
		return;
	}

	VOP_UNLOCK(vp, 0, curproc);
}