1 /*	$OpenBSD: uvm_swap.c,v 1.34 2001/08/11 10:57:22 art Exp $	*/
2 /*	$NetBSD: uvm_swap.c,v 1.37 2000/05/19 03:45:04 thorpej Exp $	*/
3 
4 /*
5  * Copyright (c) 1995, 1996, 1997 Matthew R. Green
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. The name of the author may not be used to endorse or promote products
17  *    derived from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
32  * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/buf.h>
38 #include <sys/conf.h>
39 #include <sys/proc.h>
40 #include <sys/namei.h>
41 #include <sys/disklabel.h>
42 #include <sys/errno.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/vnode.h>
46 #include <sys/file.h>
47 #include <sys/extent.h>
48 #include <sys/mount.h>
49 #include <sys/pool.h>
50 #include <sys/syscallargs.h>
51 #include <sys/swap.h>
52 
53 #include <vm/vm.h>
54 #include <uvm/uvm.h>
55 #ifdef UVM_SWAP_ENCRYPT
56 #include <sys/syslog.h>
57 #endif
58 
59 #include <miscfs/specfs/specdev.h>
60 
61 /*
62  * uvm_swap.c: manage configuration and i/o to swap space.
63  */
64 
65 /*
66  * swap space is managed in the following way:
67  *
68  * each swap partition or file is described by a "swapdev" structure.
69  * each "swapdev" structure contains a "swapent" structure which contains
70  * information that is passed up to the user (via system calls).
71  *
72  * each swap partition is assigned a "priority" (int) which controls
73  * swap partition usage.
74  *
75  * the system maintains a global data structure describing all swap
76  * partitions/files.   there is a sorted LIST of "swappri" structures
77  * which describe "swapdev"'s at that priority.   this LIST is headed
78  * by the "swap_priority" global var.    each "swappri" contains a
79  * CIRCLEQ of "swapdev" structures at that priority.
80  *
81  * the system maintains a fixed pool of "swapbuf" structures for use
82  * at swap i/o time.  a swapbuf includes a "buf" structure and an
83  * "aiodesc" structure [we want to avoid malloc()'ing anything at swapout
84  * time since memory may be low].
85  *
86  * locking:
87  *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
88  *    system call and prevents the swap priority list from changing
89  *    while we are in the middle of a system call (e.g. SWAP_STATS).
90  *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
91  *    structures including the priority list, the swapdev structures,
92  *    and the swapmap extent.
93  *  - swap_buf_lock (simple_lock): this lock protects the free swapbuf
94  *    pool.
95  *
96  * each swap device has the following info:
97  *  - swap device in use (could be disabled, preventing future use)
98  *  - swap enabled (allows new allocations on swap)
99  *  - map info in /dev/drum
100  *  - vnode pointer
101  * for swap files only:
102  *  - block size
103  *  - max byte count in buffer
104  *  - buffer
105  *  - credentials to use when doing i/o to file
106  *
107  * userland controls and configures swap with the swapctl(2) system call.
108  * the sys_swapctl performs the following operations:
109  *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
110  *  [2] SWAP_STATS: given a pointer to an array of swapent structures
111  *	(passed in via "arg") of a size passed in via "misc" ... we load
112  *	the current swap config into the array.
113  *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
114  *	priority in "misc", start swapping on it.
115  *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
116  *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
117  *	"misc")
118  */
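
/*
 * purely for illustration (not part of this file): a minimal userland
 * sketch of the swapctl(2) interface described above, roughly what
 * swapctl(8) does to list devices.  the header choices and the use of
 * SWAP_NSWAP to size the array are assumptions about typical usage,
 * not requirements imposed by the kernel side implemented below.
 *
 *	#include <sys/param.h>
 *	#include <sys/swap.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int
 *	list_swap(void)
 *	{
 *		struct swapent *sep;
 *		int i, nswap, n;
 *
 *		nswap = swapctl(SWAP_NSWAP, NULL, 0);
 *		if (nswap < 1)
 *			return (nswap);
 *		if ((sep = calloc(nswap, sizeof(*sep))) == NULL)
 *			return (-1);
 *		n = swapctl(SWAP_STATS, sep, nswap);
 *		for (i = 0; i < n; i++)
 *			printf("%s: %d/%d blocks in use, priority %d\n",
 *			    sep[i].se_path, sep[i].se_inuse,
 *			    sep[i].se_nblks, sep[i].se_priority);
 *		free(sep);
 *		return (n);
 *	}
 */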
119 
120 /*
121  * swapdev: describes a single swap partition/file
122  *
123  * note the following should be true:
124  * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
125  * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
126  */
127 struct swapdev {
128 	struct swapent	swd_se;
129 #define	swd_dev		swd_se.se_dev		/* device id */
130 #define	swd_flags	swd_se.se_flags		/* flags:inuse/enable/fake */
131 #define	swd_priority	swd_se.se_priority	/* our priority */
132 #define	swd_inuse	swd_se.se_inuse		/* blocks in use */
133 #define	swd_nblks	swd_se.se_nblks		/* total blocks */
134 	char			*swd_path;	/* saved pathname of device */
135 	int			swd_pathlen;	/* length of pathname */
136 	int			swd_npages;	/* #pages we can use */
137 	int			swd_npginuse;	/* #pages in use */
138 	int			swd_npgbad;	/* #pages bad */
139 	int			swd_drumoffset;	/* page0 offset in drum */
140 	int			swd_drumsize;	/* #pages in drum */
141 	struct extent		*swd_ex;	/* extent for this swapdev */
142 	struct vnode		*swd_vp;	/* backing vnode */
143 	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */
144 
145 	int			swd_bsize;	/* blocksize (bytes) */
146 	int			swd_maxactive;	/* max active i/o reqs */
147 	struct buf		swd_tab;	/* buffer list */
148 	struct ucred		*swd_cred;	/* cred for file access */
149 #ifdef UVM_SWAP_ENCRYPT
150 #define SWD_KEY_SHIFT		7		/* One key per 0.5 MByte */
151 #define SWD_KEY(x,y)		&((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT])
152 
153 #define SWD_DCRYPT_SHIFT	5
154 #define SWD_DCRYPT_BITS		32
155 #define SWD_DCRYPT_MASK		(SWD_DCRYPT_BITS - 1)
156 #define SWD_DCRYPT_OFF(x)	((x) >> SWD_DCRYPT_SHIFT)
157 #define SWD_DCRYPT_BIT(x)	((x) & SWD_DCRYPT_MASK)
158 #define SWD_DCRYPT_SIZE(x)	(SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK) * sizeof(u_int32_t))
159 	u_int32_t		*swd_decrypt;	/* bitmap for decryption */
160 	struct swap_key		*swd_keys;	/* keys for different parts */
161 	int			swd_nkeys;	/* active keys */
162 #endif
163 };
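
/*
 * a worked example of the encryption macros above (for illustration
 * only, assuming the usual 4KB PAGE_SIZE; the numbers are hypothetical):
 * SWD_KEY_SHIFT of 7 groups 128 consecutive drum slots (128 * 4KB =
 * 0.5 MByte) under a single swap_key.  for a swapdev with
 * swd_drumoffset == 1, SWD_KEY(sdp, 1) through SWD_KEY(sdp, 128) all
 * yield &sdp->swd_keys[0], and SWD_KEY(sdp, 129) yields
 * &sdp->swd_keys[1].  likewise SWD_DCRYPT_SIZE(npages) rounds npages
 * up to a multiple of 32 bits and converts to bytes, so a decrypt
 * bitmap for 100 pages occupies ((100 + 31) >> 5) = 4 u_int32_t's.
 */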
164 
165 /*
166  * swap device priority entry; the list is kept sorted on `spi_priority'.
167  */
168 struct swappri {
169 	int			spi_priority;     /* priority */
170 	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
171 	/* circleq of swapdevs at this priority */
172 	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
173 };
174 
175 /*
176  * swapbuf, swapbuffer plus async i/o info
177  */
178 struct swapbuf {
179 	struct buf sw_buf;		/* a buffer structure */
180 	struct uvm_aiodesc sw_aio;	/* aiodesc structure, used if ASYNC */
181 	SIMPLEQ_ENTRY(swapbuf) sw_sq;	/* free list pointer */
182 };
183 
184 /*
185  * The following two structures are used to keep track of data transfers
186  * on swap devices associated with regular files.
187  * NOTE: this code is more or less a copy of vnd.c; we use the same
188  * structure names here to ease porting..
189  * structure names here to ease porting.
190 struct vndxfer {
191 	struct buf	*vx_bp;		/* Pointer to parent buffer */
192 	struct swapdev	*vx_sdp;
193 	int		vx_error;
194 	int		vx_pending;	/* # of pending aux buffers */
195 	int		vx_flags;
196 #define VX_BUSY		1
197 #define VX_DEAD		2
198 };
199 
200 struct vndbuf {
201 	struct buf	vb_buf;
202 	struct vndxfer	*vb_xfer;
203 };
204 
205 
206 /*
207  * We keep a pool of vndbuf's and vndxfer structures.
208  */
209 struct pool vndxfer_pool;
210 struct pool vndbuf_pool;
211 
212 #define	getvndxfer(vnx)	do {						\
213 	int s = splbio();						\
214 	vnx = pool_get(&vndxfer_pool, PR_MALLOCOK|PR_WAITOK);		\
215 	splx(s);							\
216 } while (0)
217 
218 #define putvndxfer(vnx) {						\
219 	pool_put(&vndxfer_pool, (void *)(vnx));				\
220 }
221 
222 #define	getvndbuf(vbp)	do {						\
223 	int s = splbio();						\
224 	vbp = pool_get(&vndbuf_pool, PR_MALLOCOK|PR_WAITOK);		\
225 	splx(s);							\
226 } while (0)
227 
228 #define putvndbuf(vbp) {						\
229 	pool_put(&vndbuf_pool, (void *)(vbp));				\
230 }
231 
232 /* /dev/drum */
233 bdev_decl(sw);
234 cdev_decl(sw);
235 
236 /*
237  * local variables
238  */
239 static struct extent *swapmap;		/* controls the mapping of /dev/drum */
240 SIMPLEQ_HEAD(swapbufhead, swapbuf);
241 struct pool swapbuf_pool;
242 
243 /* list of all active swap devices [by priority] */
244 LIST_HEAD(swap_priority, swappri);
245 static struct swap_priority swap_priority;
246 
247 /* locks */
248 lock_data_t swap_syscall_lock;
249 
250 /*
251  * prototypes
252  */
253 static void		 swapdrum_add __P((struct swapdev *, int));
254 static struct swapdev	*swapdrum_getsdp __P((int));
255 
256 static struct swapdev	*swaplist_find __P((struct vnode *, int));
257 static void		 swaplist_insert __P((struct swapdev *,
258 					     struct swappri *, int));
259 static void		 swaplist_trim __P((void));
260 
261 static int swap_on __P((struct proc *, struct swapdev *));
262 static int swap_off __P((struct proc *, struct swapdev *));
263 
264 static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
265 static void sw_reg_iodone __P((struct buf *));
266 static void sw_reg_start __P((struct swapdev *));
267 
268 static void uvm_swap_aiodone __P((struct uvm_aiodesc *));
269 static void uvm_swap_bufdone __P((struct buf *));
270 static int uvm_swap_io __P((struct vm_page **, int, int, int));
271 
272 static void swapmount __P((void));
273 
274 #ifdef UVM_SWAP_ENCRYPT
275 /* for swap encrypt */
276 boolean_t uvm_swap_allocpages __P((struct vm_page **, int));
277 void uvm_swap_freepages __P((struct vm_page **, int));
278 void uvm_swap_markdecrypt __P((struct swapdev *, int, int, int));
279 boolean_t uvm_swap_needdecrypt __P((struct swapdev *, int));
280 void uvm_swap_initcrypt __P((struct swapdev *, int));
281 #endif
282 
283 /*
284  * uvm_swap_init: init the swap system data structures and locks
285  *
286  * => called at boot time from init_main.c after the filesystems
287  *	are brought up (which happens after uvm_init())
288  */
289 void
290 uvm_swap_init()
291 {
292 	UVMHIST_FUNC("uvm_swap_init");
293 
294 	UVMHIST_CALLED(pdhist);
295 	/*
296 	 * first, init the swap list, its counter, and its lock.
297 	 * then get a handle on the vnode for /dev/drum by using
298 	 * its dev_t number ("swapdev", from MD conf.c).
299 	 */
300 
301 	LIST_INIT(&swap_priority);
302 	uvmexp.nswapdev = 0;
303 	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
304 	simple_lock_init(&uvm.swap_data_lock);
305 
306 	if (bdevvp(swapdev, &swapdev_vp))
307 		panic("uvm_swap_init: can't get vnode for swap device");
308 
309 	/*
310 	 * create swap block resource map to map /dev/drum.   the range
311 	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
312 	 * that block 0 is reserved (used to indicate an allocation
313 	 * failure, or no allocation).
314 	 */
315 	swapmap = extent_create("swapmap", 1, INT_MAX,
316 				M_VMSWAP, 0, 0, EX_NOWAIT);
317 	if (swapmap == 0)
318 		panic("uvm_swap_init: extent_create failed");
319 
320 	/*
321 	 * allocate our private pool of "swapbuf" structures (includes
322 	 * a "buf" structure).  ["nswbuf" comes from param.c and can
323 	 * be adjusted by MD code before we get here].
324 	 */
325 
326 
327 	pool_init(&swapbuf_pool, sizeof(struct swapbuf), 0, 0, 0, "swp buf", 0,
328 			    NULL, NULL, 0);
329 	/* XXX - set a maximum on swapbuf_pool? */
330 
331 	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
332 			    0, NULL, NULL, 0);
333 
334 	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 0,
335 			    NULL, NULL, 0);
336 
337 	/*
338 	 * Setup the initial swap partition
339 	 */
340 	swapmount();
341 
342 	/*
343 	 * done!
344 	 */
345 	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
346 }
347 
348 #ifdef UVM_SWAP_ENCRYPT
349 void
350 uvm_swap_initcrypt_all(void)
351 {
352 	struct swapdev *sdp;
353 	struct swappri *spp;
354 
355 	simple_lock(&uvm.swap_data_lock);
356 
357 	for (spp = swap_priority.lh_first; spp != NULL;
358 	     spp = spp->spi_swappri.le_next) {
359 		for (sdp = spp->spi_swapdev.cqh_first;
360 		     sdp != (void *)&spp->spi_swapdev;
361 		     sdp = sdp->swd_next.cqe_next)
362 			if (sdp->swd_decrypt == NULL)
363 				uvm_swap_initcrypt(sdp, sdp->swd_npages);
364 	}
365 	simple_unlock(&uvm.swap_data_lock);
366 }
367 
368 void
369 uvm_swap_initcrypt(struct swapdev *sdp, int npages)
370 {
371 	/*
372 	 * keep information if a page needs to be decrypted when we get it
373 	 * from the swap device.
374 	 * We cannot chance a malloc later; if we are doing ASYNC puts,
375 	 * we may not call malloc with M_WAITOK.  This consumes only
376 	 * 8KB memory for a 256MB swap partition.
377 	 */
378 	sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages), M_VMSWAP, M_WAITOK);
379 	memset(sdp->swd_decrypt, 0, SWD_DCRYPT_SIZE(npages));
380 	sdp->swd_keys = malloc((npages >> SWD_KEY_SHIFT) * sizeof(struct swap_key),
381 			       M_VMSWAP, M_WAITOK);
382 	memset(sdp->swd_keys, 0, (npages >> SWD_KEY_SHIFT) * sizeof(struct swap_key));
383 	sdp->swd_nkeys = 0;
384 }
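
/*
 * a quick sanity check of the size claim in the comment above (pure
 * illustration, assuming 4KB pages): a 256MB swap partition holds
 * 65536 pages, so the decrypt bitmap is SWD_DCRYPT_SIZE(65536) =
 * (65536 / 32) * sizeof(u_int32_t) = 8KB, and the key array holds
 * 65536 >> SWD_KEY_SHIFT = 512 swap_key entries.
 */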
385 
386 boolean_t
387 uvm_swap_allocpages(struct vm_page **pps, int npages)
388 {
389 	int i, s;
390 	int minus, reserve;
391 	boolean_t fail;
392 
393 	/* Estimate if we will succeed */
394 	s = uvm_lock_fpageq();
395 
396 	minus = uvmexp.free - npages;
397 	reserve = uvmexp.reserve_kernel;
398 	fail = minus < reserve;
399 
400 	uvm_unlock_fpageq(s);
401 
402 	if (fail)
403 		return FALSE;
404 
405 	/* Get new pages */
406 	for (i = 0; i < npages; i++) {
407 		pps[i] = uvm_pagealloc(NULL, 0, NULL, 0);
408 		if (pps[i] == NULL)
409 			break;
410 	}
411 
412 	/* On failure free and return */
413 	if (i < npages) {
414 		uvm_swap_freepages(pps, i);
415 		return FALSE;
416 	}
417 
418 	return TRUE;
419 }
420 
421 void
422 uvm_swap_freepages(struct vm_page **pps, int npages)
423 {
424 	int i;
425 
426 	uvm_lock_pageq();
427 	for (i = 0; i < npages; i++)
428 		uvm_pagefree(pps[i]);
429 	uvm_unlock_pageq();
430 }
431 
432 /*
433  * Mark pages on the swap device for later decryption
434  */
435 
436 void
437 uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages,
438 		     int decrypt)
439 {
440 	int pagestart, i;
441 	int off, bit;
442 
443 	if (!sdp)
444 		return;
445 
446 	pagestart = startslot - sdp->swd_drumoffset;
447 	for (i = 0; i < npages; i++, pagestart++) {
448 		off = SWD_DCRYPT_OFF(pagestart);
449 		bit = SWD_DCRYPT_BIT(pagestart);
450 		if (decrypt)
451 			/* pages read need decryption */
452 			sdp->swd_decrypt[off] |= 1 << bit;
453 		else
454 			/* pages read do not need decryption */
455 			sdp->swd_decrypt[off] &= ~(1 << bit);
456 	}
457 }
458 
459 /*
460  * Check if the page that we got from disk needs to be decrypted
461  */
462 
463 boolean_t
464 uvm_swap_needdecrypt(struct swapdev *sdp, int off)
465 {
466 	if (!sdp)
467 		return FALSE;
468 
469 	off -= sdp->swd_drumoffset;
470 	return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)] & (1 << SWD_DCRYPT_BIT(off)) ?
471 		TRUE : FALSE;
472 }
473 #endif /* UVM_SWAP_ENCRYPT */
474 /*
475  * swaplist functions: functions that operate on the list of swap
476  * devices on the system.
477  */
478 
479 /*
480  * swaplist_insert: insert swap device "sdp" into the global list
481  *
482  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
483  * => caller must provide a newly malloc'd swappri structure (we will
484  *	FREE it if we don't need it... this is to prevent malloc blocking
485  *	here while adding swap)
486  */
487 static void
488 swaplist_insert(sdp, newspp, priority)
489 	struct swapdev *sdp;
490 	struct swappri *newspp;
491 	int priority;
492 {
493 	struct swappri *spp, *pspp;
494 	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);
495 
496 	/*
497 	 * find entry at or after which to insert the new device.
498 	 */
499 	for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL;
500 	     spp = LIST_NEXT(spp, spi_swappri)) {
501 		if (priority <= spp->spi_priority)
502 			break;
503 		pspp = spp;
504 	}
505 
506 	/*
507 	 * new priority?
508 	 */
509 	if (spp == NULL || spp->spi_priority != priority) {
510 		spp = newspp;  /* use newspp! */
511 		UVMHIST_LOG(pdhist, "created new swappri = %d",
512 			    priority, 0, 0, 0);
513 
514 		spp->spi_priority = priority;
515 		CIRCLEQ_INIT(&spp->spi_swapdev);
516 
517 		if (pspp)
518 			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
519 		else
520 			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
521 	} else {
522 	  	/* we don't need a new priority structure, free it */
523 		FREE(newspp, M_VMSWAP);
524 	}
525 
526 	/*
527 	 * priority found (or created).   now insert on the priority's
528 	 * circleq list and bump the total number of swapdevs.
529 	 */
530 	sdp->swd_priority = priority;
531 	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
532 	uvmexp.nswapdev++;
533 }
534 
535 /*
536  * swaplist_find: find and optionally remove a swap device from the
537  *	global list.
538  *
539  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
540  * => we return the swapdev we found (and removed)
541  */
542 static struct swapdev *
543 swaplist_find(vp, remove)
544 	struct vnode *vp;
545 	boolean_t remove;
546 {
547 	struct swapdev *sdp;
548 	struct swappri *spp;
549 
550 	/*
551 	 * search the lists for the requested vp
552 	 */
553 	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
554 	     spp = LIST_NEXT(spp, spi_swappri)) {
555 		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
556 		     sdp != (void *)&spp->spi_swapdev;
557 		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
558 			if (sdp->swd_vp == vp) {
559 				if (remove) {
560 					CIRCLEQ_REMOVE(&spp->spi_swapdev,
561 					    sdp, swd_next);
562 					uvmexp.nswapdev--;
563 				}
564 				return(sdp);
565 			}
566 	}
567 	return (NULL);
568 }
569 
570 
571 /*
572  * swaplist_trim: scan priority list for empty priority entries and kill
573  *	them.
574  *
575  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
576  */
577 static void
578 swaplist_trim()
579 {
580 	struct swappri *spp, *nextspp;
581 
582 	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
583 		nextspp = LIST_NEXT(spp, spi_swappri);
584 		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
585 		    (void *)&spp->spi_swapdev)
586 			continue;
587 		LIST_REMOVE(spp, spi_swappri);
588 		free(spp, M_VMSWAP);
589 	}
590 }
591 
592 /*
593  * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
594  *
595  * => caller must hold swap_syscall_lock
596  * => uvm.swap_data_lock should be unlocked (we may sleep)
597  */
598 static void
599 swapdrum_add(sdp, npages)
600 	struct swapdev *sdp;
601 	int	npages;
602 {
603 	u_long result;
604 
605 	if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY,
606 	    EX_WAITOK, &result))
607 		panic("swapdrum_add");
608 
609 	sdp->swd_drumoffset = result;
610 	sdp->swd_drumsize = npages;
611 }
612 
613 /*
614  * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
615  *	to the "swapdev" that maps that section of the drum.
616  *
617  * => each swapdev takes one big contig chunk of the drum
618  * => caller must hold uvm.swap_data_lock
619  */
620 static struct swapdev *
621 swapdrum_getsdp(pgno)
622 	int pgno;
623 {
624 	struct swapdev *sdp;
625 	struct swappri *spp;
626 
627 	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
628 	     spp = LIST_NEXT(spp, spi_swappri))
629 		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
630 		     sdp != (void *)&spp->spi_swapdev;
631 		     sdp = CIRCLEQ_NEXT(sdp, swd_next))
632 			if (pgno >= sdp->swd_drumoffset &&
633 			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
634 				return sdp;
635 			}
636 	return NULL;
637 }
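
/*
 * drum layout, by way of a hypothetical example (numbers made up for
 * illustration): if two swapdevs of 1000 and 500 pages are added,
 * swapdrum_add() might carve slots 1..1000 out of the swapmap extent
 * for the first (swd_drumoffset 1, swd_drumsize 1000) and slots
 * 1001..1500 for the second.  swapdrum_getsdp(1200) then walks the
 * priority list and returns the second swapdev, since 1200 falls in
 * [1001, 1501).  slot 0 is never handed out; it means "no allocation".
 */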
638 
639 
640 /*
641  * sys_swapctl: main entry point for swapctl(2) system call
642  * 	[with two helper functions: swap_on and swap_off]
643  */
644 int
645 sys_swapctl(p, v, retval)
646 	struct proc *p;
647 	void *v;
648 	register_t *retval;
649 {
650 	struct sys_swapctl_args /* {
651 		syscallarg(int) cmd;
652 		syscallarg(void *) arg;
653 		syscallarg(int) misc;
654 	} */ *uap = (struct sys_swapctl_args *)v;
655 	struct vnode *vp;
656 	struct nameidata nd;
657 	struct swappri *spp;
658 	struct swapdev *sdp;
659 	struct swapent *sep;
660 	char	userpath[MAXPATHLEN];
661 	size_t	len;
662 	int	count, error, misc;
663 	int	priority;
664 	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);
665 
666 	misc = SCARG(uap, misc);
667 
668 	/*
669 	 * ensure serialized syscall access by grabbing the swap_syscall_lock
670 	 */
671 	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, NULL, p);
672 
673 	/*
674 	 * we handle the non-priv NSWAP and STATS request first.
675 	 *
676 	 * SWAP_NSWAP: return number of config'd swap devices
677 	 * [can also be obtained with uvmexp sysctl]
678 	 */
679 	if (SCARG(uap, cmd) == SWAP_NSWAP) {
680 		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
681 		    0, 0, 0);
682 		*retval = uvmexp.nswapdev;
683 		error = 0;
684 		goto out;
685 	}
686 
687 	/*
688 	 * SWAP_STATS: get stats on current # of configured swap devs
689 	 *
690 	 * note that the swap_priority list can't change as long
691 	 * as we are holding the swap_syscall_lock.  we don't want
692 	 * to grab the uvm.swap_data_lock because we may fault&sleep during
693 	 * copyout() and we don't want to be holding that lock then!
694 	 */
695 	if (SCARG(uap, cmd) == SWAP_STATS
696 #if defined(COMPAT_13)
697 	    || SCARG(uap, cmd) == SWAP_OSTATS
698 #endif
699 	    ) {
700 		sep = (struct swapent *)SCARG(uap, arg);
701 		count = 0;
702 
703 		for (spp = LIST_FIRST(&swap_priority); spp != NULL;
704 		    spp = LIST_NEXT(spp, spi_swappri)) {
705 			for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
706 			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
707 			     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
708 				sdp->swd_inuse =
709 				    btodb(sdp->swd_npginuse << PAGE_SHIFT);
710 				error = copyout(&sdp->swd_se, sep,
711 				    sizeof(struct swapent));
712 
713 				/* now copy out the path if necessary */
714 #if defined(COMPAT_13)
715 				if (error == 0 && SCARG(uap, cmd) == SWAP_STATS)
716 #else
717 				if (error == 0)
718 #endif
719 					error = copyout(sdp->swd_path,
720 					    &sep->se_path, sdp->swd_pathlen);
721 
722 				if (error)
723 					goto out;
724 				count++;
725 #if defined(COMPAT_13)
726 				if (SCARG(uap, cmd) == SWAP_OSTATS)
727 					((struct oswapent *)sep)++;
728 				else
729 #endif
730 					sep++;
731 			}
732 		}
733 
734 		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
735 
736 		*retval = count;
737 		error = 0;
738 		goto out;
739 	}
740 
741 	/*
742 	 * all other requests require superuser privs.   verify.
743 	 */
744 	if ((error = suser(p->p_ucred, &p->p_acflag)))
745 		goto out;
746 
747 	/*
748 	 * at this point we expect a path name in arg.   we will
749 	 * use namei() to gain a vnode reference (vref), and lock
750 	 * the vnode (VOP_LOCK).
751 	 *
752 	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
753 	 * miniroot)
754 	 */
755 	if (SCARG(uap, arg) == NULL) {
756 		vp = rootvp;		/* miniroot */
757 		if (vget(vp, LK_EXCLUSIVE, p)) {
758 			error = EBUSY;
759 			goto out;
760 		}
761 		if (SCARG(uap, cmd) == SWAP_ON &&
762 		    copystr("miniroot", userpath, sizeof userpath, &len))
763 			panic("swapctl: miniroot copy failed");
764 	} else {
765 		int	space;
766 		char	*where;
767 
768 		if (SCARG(uap, cmd) == SWAP_ON) {
769 			if ((error = copyinstr(SCARG(uap, arg), userpath,
770 			    sizeof userpath, &len)))
771 				goto out;
772 			space = UIO_SYSSPACE;
773 			where = userpath;
774 		} else {
775 			space = UIO_USERSPACE;
776 			where = (char *)SCARG(uap, arg);
777 		}
778 		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
779 		if ((error = namei(&nd)))
780 			goto out;
781 		vp = nd.ni_vp;
782 	}
783 	/* note: "vp" is referenced and locked */
784 
785 	error = 0;		/* assume no error */
786 	switch(SCARG(uap, cmd)) {
787 	case SWAP_DUMPDEV:
788 		if (vp->v_type != VBLK) {
789 			error = ENOTBLK;
790 			goto out;
791 		}
792 		dumpdev = vp->v_rdev;
793 
794 		break;
795 
796 	case SWAP_CTL:
797 		/*
798 		 * get new priority, remove old entry (if any) and then
799 		 * reinsert it in the correct place.  finally, prune out
800 		 * any empty priority structures.
801 		 */
802 		priority = SCARG(uap, misc);
803 		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
804 		simple_lock(&uvm.swap_data_lock);
805 		if ((sdp = swaplist_find(vp, 1)) == NULL) {
806 			error = ENOENT;
807 		} else {
808 			swaplist_insert(sdp, spp, priority);
809 			swaplist_trim();
810 		}
811 		simple_unlock(&uvm.swap_data_lock);
812 		if (error)
813 			free(spp, M_VMSWAP);
814 		break;
815 
816 	case SWAP_ON:
817 
818 		/*
819 		 * check for duplicates.   if none found, then insert a
820 		 * dummy entry on the list to prevent someone else from
821 		 * trying to enable this device while we are working on
822 		 * it.
823 		 */
824 
825 		priority = SCARG(uap, misc);
826 		simple_lock(&uvm.swap_data_lock);
827 		if ((sdp = swaplist_find(vp, 0)) != NULL) {
828 			error = EBUSY;
829 			simple_unlock(&uvm.swap_data_lock);
830 			break;
831 		}
832 		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
833 		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
834 		memset(sdp, 0, sizeof(*sdp));
835 		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
836 		sdp->swd_vp = vp;
837 		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
838 
839 		/*
840 		 * XXX Is NFS elaboration necessary?
841 		 */
842 		if (vp->v_type == VREG) {
843 			sdp->swd_cred = crdup(p->p_ucred);
844 		}
845 
846 		swaplist_insert(sdp, spp, priority);
847 		simple_unlock(&uvm.swap_data_lock);
848 
849 		sdp->swd_pathlen = len;
850 		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
851 		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
852 			panic("swapctl: copystr");
853 
854 		/*
855 		 * we've now got a FAKE placeholder in the swap list.
856 		 * now attempt to enable swap on it.  if we fail, undo
857 		 * what we've done and kill the fake entry we just inserted.
858 		 * if swap_on is a success, it will clear the SWF_FAKE flag
859 		 */
860 
861 		if ((error = swap_on(p, sdp)) != 0) {
862 			simple_lock(&uvm.swap_data_lock);
863 			(void) swaplist_find(vp, 1);  /* kill fake entry */
864 			swaplist_trim();
865 			simple_unlock(&uvm.swap_data_lock);
866 			if (vp->v_type == VREG) {
867 				crfree(sdp->swd_cred);
868 			}
869 			free(sdp->swd_path, M_VMSWAP);
870 			free(sdp, M_VMSWAP);
871 			break;
872 		}
873 
874 		/*
875 		 * got it!   now add a second reference to vp so that
876 		 * we keep a reference to the vnode after we return.
877 		 */
878 		vref(vp);
879 		break;
880 
881 	case SWAP_OFF:
882 		simple_lock(&uvm.swap_data_lock);
883 		if ((sdp = swaplist_find(vp, 0)) == NULL) {
884 			simple_unlock(&uvm.swap_data_lock);
885 			error = ENXIO;
886 			break;
887 		}
888 
889 		/*
890 		 * If a device isn't in use or enabled, we
891 		 * can't stop swapping from it (again).
892 		 */
893 		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
894 			simple_unlock(&uvm.swap_data_lock);
895 			error = EBUSY;
896 			break;
897 		}
898 
899 		/*
900 		 * do the real work.
901 		 */
902 		if ((error = swap_off(p, sdp)) != 0)
903 			goto out;
904 
905 		break;
906 
907 	default:
908 		error = EINVAL;
909 	}
910 
911 	/*
912 	 * done!   use vput to drop our reference and unlock
913 	 */
914 	vput(vp);
915 out:
916 	lockmgr(&swap_syscall_lock, LK_RELEASE, NULL, p);
917 
918 	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
919 	return (error);
920 }
921 
922 /*
923  * swap_on: attempt to enable a swapdev for swapping.   note that the
924  *	swapdev is already on the global list, but disabled (marked
925  *	SWF_FAKE).
926  *
927  * => we avoid the start of the disk (to protect disk labels)
928  * => we also avoid the miniroot, if we are swapping to root.
929  * => caller should leave uvm.swap_data_lock unlocked, we may lock it
930  *	if needed.
931  */
932 static int
933 swap_on(p, sdp)
934 	struct proc *p;
935 	struct swapdev *sdp;
936 {
937 	static int count = 0;	/* static */
938 	struct vnode *vp;
939 	int error, npages, nblocks, size;
940 	long addr;
941 	struct vattr va;
942 #if defined(NFSCLIENT)
943 	extern int (**nfsv2_vnodeop_p) __P((void *));
944 #endif /* defined(NFSCLIENT) */
945 	dev_t dev;
946 	char *name;
947 	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
948 
949 	/*
950 	 * we want to enable swapping on sdp.   the swd_vp contains
951 	 * the vnode we want (locked and ref'd), and the swd_dev
952 	 * contains the dev_t of the file, if it is a block device.
953 	 */
954 
955 	vp = sdp->swd_vp;
956 	dev = sdp->swd_dev;
957 
958 	/*
959 	 * open the swap file (mostly useful for block device files to
960 	 * let device driver know what is up).
961 	 *
962 	 * we skip the open/close for root on swap because the root
963 	 * has already been opened when root was mounted (mountroot).
964 	 */
965 	if (vp != rootvp) {
966 		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
967 			return (error);
968 	}
969 
970 	/* XXX this only works for block devices */
971 	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);
972 
973 	/*
974 	 * we now need to determine the size of the swap area.   for
975 	 * block specials we can call the d_psize function.
976 	 * for normal files, we must stat [get attrs].
977 	 *
978 	 * we put the result in nblocks.
979 	 * for normal files, we also want the filesystem block size
980 	 * (which we get with statfs).
981 	 */
982 	switch (vp->v_type) {
983 	case VBLK:
984 		if (bdevsw[major(dev)].d_psize == 0 ||
985 		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
986 			error = ENXIO;
987 			goto bad;
988 		}
989 		break;
990 
991 	case VREG:
992 		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
993 			goto bad;
994 		nblocks = (int)btodb(va.va_size);
995 		if ((error =
996 		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
997 			goto bad;
998 
999 		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
1000 		/*
1001 		 * limit the max # of outstanding I/O requests we issue
1002 		 * at any one time.   take it easy on NFS servers.
1003 		 */
1004 #if defined(NFSCLIENT)
1005 		if (vp->v_op == nfsv2_vnodeop_p)
1006 			sdp->swd_maxactive = 2; /* XXX */
1007 		else
1008 #endif /* defined(NFSCLIENT) */
1009 			sdp->swd_maxactive = 8; /* XXX */
1010 		break;
1011 
1012 	default:
1013 		error = ENXIO;
1014 		goto bad;
1015 	}
1016 
1017 	/*
1018 	 * save nblocks in a safe place and convert to pages.
1019 	 */
1020 
1021 	sdp->swd_nblks = nblocks;
1022 	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;
1023 
1024 	/*
1025 	 * for block special files, we want to make sure that we leave
1026 	 * the disklabel and bootblocks alone, so we arrange to skip
1027 	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
1028 	 * note that because of this the "size" can be less than the
1029 	 * actual number of blocks on the device.
1030 	 */
1031 	if (vp->v_type == VBLK) {
1032 		/* we use pages 1 to (npages - 1) [inclusive] */
1033 		size = npages - 1;
1034 		addr = 1;
1035 	} else {
1036 		/* we use pages 0 to (size - 1) [inclusive] */
1037 		size = npages;
1038 		addr = 0;
1039 	}
1040 
1041 	/*
1042 	 * make sure we have enough blocks for a reasonable sized swap
1043 	 * area.   we want at least one page.
1044 	 */
1045 
1046 	if (size < 1) {
1047 		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
1048 		error = EINVAL;
1049 		goto bad;
1050 	}
1051 
1052 	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);
1053 
1054 	/*
1055 	 * now we need to allocate an extent to manage this swap device
1056 	 */
1057 	name = malloc(12, M_VMSWAP, M_WAITOK);
1058 	sprintf(name, "swap0x%04x", count++);
1059 
1060 	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
1061 	sdp->swd_ex = extent_create(name, 0, npages - 1, M_VMSWAP,
1062 				    0, 0, EX_WAITOK);
1063 	/* allocate the `saved' region from the extent so it won't be used */
1064 	if (addr) {
1065 		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
1066 			panic("disklabel region");
1067 	}
1068 
1069 	/*
1070 	 * if the vnode we are swapping to is the root vnode
1071 	 * (i.e. we are swapping to the miniroot) then we want
1072 	 * to make sure we don't overwrite it.   do a statfs to
1073 	 * find its size and skip over it.
1074 	 */
1075 	if (vp == rootvp) {
1076 		struct mount *mp;
1077 		struct statfs *sp;
1078 		int rootblocks, rootpages;
1079 
1080 		mp = rootvnode->v_mount;
1081 		sp = &mp->mnt_stat;
1082 		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
1083 		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
1084 		if (rootpages > size)
1085 			panic("swap_on: miniroot larger than swap?");
1086 
1087 		if (extent_alloc_region(sdp->swd_ex, addr,
1088 					rootpages, EX_WAITOK))
1089 			panic("swap_on: unable to preserve miniroot");
1090 
1091 		size -= rootpages;
1092 		printf("Preserved %d pages of miniroot ", rootpages);
1093 		printf("leaving %d pages of swap\n", size);
1094 	}
1095 
1096   	/*
1097 	 * add anons to reflect the new swap space
1098 	 */
1099 	uvm_anon_add(size);
1100 
1101 #ifdef UVM_SWAP_ENCRYPT
1102 	if (uvm_doswapencrypt)
1103 		uvm_swap_initcrypt(sdp, npages);
1104 #endif
1105 	/*
1106 	 * now add the new swapdev to the drum and enable.
1107 	 */
1108 	simple_lock(&uvm.swap_data_lock);
1109 	swapdrum_add(sdp, npages);
1110 	sdp->swd_npages = size;
1111 	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
1112 	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
1113 	simple_unlock(&uvm.swap_data_lock);
1114 	uvmexp.swpages += size;
1115 
1116 	return (0);
1117 
1118 bad:
1119 	/*
1120 	 * failure: close device if necessary and return error.
1121 	 */
1122 	if (vp != rootvp)
1123 		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
1124 	return (error);
1125 }
1126 
1127 /*
1128  * swap_off: stop swapping on swapdev
1129  *
1130  * => swap data should be locked, we will unlock.
1131  */
1132 static int
1133 swap_off(p, sdp)
1134 	struct proc *p;
1135 	struct swapdev *sdp;
1136 {
1137 	void *name;
1138 	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
1139 	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev,0,0,0);
1140 
1141 	/* disable the swap area being removed */
1142 	sdp->swd_flags &= ~SWF_ENABLE;
1143 	simple_unlock(&uvm.swap_data_lock);
1144 
1145 	/*
1146 	 * the idea is to find all the pages that are paged out to this
1147 	 * device, and page them all in.  in uvm, swap-backed pageable
1148 	 * memory can take two forms: aobjs and anons.  call the
1149 	 * swapoff hook for each subsystem to bring in pages.
1150 	 */
1151 
1152 	if (uao_swap_off(sdp->swd_drumoffset,
1153 			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
1154 	    anon_swap_off(sdp->swd_drumoffset,
1155 			  sdp->swd_drumoffset + sdp->swd_drumsize)) {
1156 
1157 		simple_lock(&uvm.swap_data_lock);
1158 		sdp->swd_flags |= SWF_ENABLE;
1159 		simple_unlock(&uvm.swap_data_lock);
1160 		return ENOMEM;
1161 	}
1162 
1163 #ifdef DIAGNOSTIC
1164 	if (sdp->swd_npginuse != sdp->swd_npgbad) {
1165 		panic("swap_off: sdp %p - %d pages still in use (%d bad)\n",
1166 		      sdp, sdp->swd_npginuse, sdp->swd_npgbad);
1167 	}
1168 #endif
1169 
1170 	/*
1171 	 * done with the vnode.
1172 	 */
1173 	if (sdp->swd_vp->v_type == VREG) {
1174 		crfree(sdp->swd_cred);
1175 	}
1176 	if (sdp->swd_vp != rootvp) {
1177 		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
1178 	}
1179 	if (sdp->swd_vp) {
1180 		vrele(sdp->swd_vp);
1181 	}
1182 
1183 	/* remove anons from the system */
1184 	uvm_anon_remove(sdp->swd_npages);
1185 
1186 	simple_lock(&uvm.swap_data_lock);
1187 	uvmexp.swpages -= sdp->swd_npages;
1188 
1189 	if (swaplist_find(sdp->swd_vp, 1) == NULL)
1190 		panic("swap_off: swapdev not in list\n");
1191 	swaplist_trim();
1192 
1193 	/*
1194 	 * free all resources!
1195 	 */
1196 	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
1197 		    EX_WAITOK);
1198 	name = (void *)sdp->swd_ex->ex_name;
1199 	extent_destroy(sdp->swd_ex);
1200 	free(name, M_VMSWAP);
1201 	free(sdp, M_VMSWAP);
1202 	simple_unlock(&uvm.swap_data_lock);
1203 	return (0);
1204 }
1205 
1206 /*
1207  * /dev/drum interface and i/o functions
1208  */
1209 
1210 /*
1211  * swread: the read function for the drum (just a call to physio)
1212  */
1213 /*ARGSUSED*/
1214 int
1215 swread(dev, uio, ioflag)
1216 	dev_t dev;
1217 	struct uio *uio;
1218 	int ioflag;
1219 {
1220 	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);
1221 
1222 	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
1223 	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1224 }
1225 
1226 /*
1227  * swwrite: the write function for the drum (just a call to physio)
1228  */
1229 /*ARGSUSED*/
1230 int
1231 swwrite(dev, uio, ioflag)
1232 	dev_t dev;
1233 	struct uio *uio;
1234 	int ioflag;
1235 {
1236 	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);
1237 
1238 	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
1239 	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1240 }
1241 
1242 /*
1243  * swstrategy: perform I/O on the drum
1244  *
1245  * => we must map the i/o request from the drum to the correct swapdev.
1246  */
1247 void
1248 swstrategy(bp)
1249 	struct buf *bp;
1250 {
1251 	struct swapdev *sdp;
1252 	int s, pageno, bn;
1253 	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);
1254 
1255 	/*
1256 	 * convert block number to swapdev.   note that swapdev can't
1257 	 * be yanked out from under us because we are holding resources
1258 	 * in it (i.e. the blocks we are doing I/O on).
1259 	 */
1260 	pageno = dbtob(bp->b_blkno) >> PAGE_SHIFT;
1261 	simple_lock(&uvm.swap_data_lock);
1262 	sdp = swapdrum_getsdp(pageno);
1263 	simple_unlock(&uvm.swap_data_lock);
1264 	if (sdp == NULL) {
1265 		bp->b_error = EINVAL;
1266 		bp->b_flags |= B_ERROR;
1267 		biodone(bp);
1268 		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
1269 		return;
1270 	}
1271 
1272 	/*
1273 	 * convert drum page number to block number on this swapdev.
1274 	 */
1275 
1276 	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
1277 	bn = btodb(pageno << PAGE_SHIFT);	/* convert to diskblock */
1278 
1279 	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld\n",
1280 		((bp->b_flags & B_READ) == 0) ? "write" : "read",
1281 		sdp->swd_drumoffset, bn, bp->b_bcount);
1282 
1283 	/*
1284 	 * for block devices we finish up here.
1285 	 * for regular files we have to do more work which we delegate
1286 	 * to sw_reg_strategy().
1287 	 */
1288 
1289 	switch (sdp->swd_vp->v_type) {
1290 	default:
1291 		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
1292 
1293 	case VBLK:
1294 
1295 		/*
1296 		 * must convert "bp" from an I/O on /dev/drum to an I/O
1297 		 * on the swapdev (sdp).
1298 		 */
1299 		s = splbio();
1300 		buf_replacevnode(bp, sdp->swd_vp);
1301 
1302 		bp->b_blkno = bn;
1303       		splx(s);
1304 		VOP_STRATEGY(bp);
1305 		return;
1306 
1307 	case VREG:
1308 		/*
1309 		 * delegate to sw_reg_strategy function.
1310 		 */
1311 		sw_reg_strategy(sdp, bp, bn);
1312 		return;
1313 	}
1314 	/* NOTREACHED */
1315 }
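
/*
 * the block/page conversions above, worked through for illustration
 * (assuming 512-byte disk blocks, 4KB pages, and a hypothetical
 * swapdev with swd_drumoffset == 1): an i/o to drum block 8008 gives
 * pageno = dbtob(8008) >> PAGE_SHIFT = 1001, which swapdrum_getsdp()
 * maps to that swapdev; pageno then becomes the device-relative page
 * 1000, and bn = btodb(1000 << PAGE_SHIFT) = 8000 is the block number
 * actually handed to the swapdev's driver or to sw_reg_strategy().
 */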
1316 
1317 /*
1318  * sw_reg_strategy: handle swap i/o to regular files
1319  */
1320 static void
1321 sw_reg_strategy(sdp, bp, bn)
1322 	struct swapdev	*sdp;
1323 	struct buf	*bp;
1324 	int		bn;
1325 {
1326 	struct vnode	*vp;
1327 	struct vndxfer	*vnx;
1328 	daddr_t		nbn, byteoff;
1329 	caddr_t		addr;
1330 	int		s, off, nra, error, sz, resid;
1331 	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);
1332 
1333 	/*
1334 	 * allocate a vndxfer head for this transfer and point it to
1335 	 * our buffer.
1336 	 */
1337 	getvndxfer(vnx);
1338 	vnx->vx_flags = VX_BUSY;
1339 	vnx->vx_error = 0;
1340 	vnx->vx_pending = 0;
1341 	vnx->vx_bp = bp;
1342 	vnx->vx_sdp = sdp;
1343 
1344 	/*
1345 	 * setup for main loop where we read filesystem blocks into
1346 	 * our buffer.
1347 	 */
1348 	error = 0;
1349 	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
1350 	addr = bp->b_data;		/* current position in buffer */
1351 	byteoff = dbtob(bn);
1352 
1353 	for (resid = bp->b_resid; resid; resid -= sz) {
1354 		struct vndbuf	*nbp;
1355 
1356 		/*
1357 		 * translate byteoffset into block number.  return values:
1358 		 *   vp = vnode of underlying device
1359 		 *  nbn = new block number (on underlying vnode dev)
1360 		 *  nra = num blocks we can read-ahead (excludes requested
1361 		 *	block)
1362 		 */
1363 		nra = 0;
1364 		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1365 				 	&vp, &nbn, &nra);
1366 
1367 		if (error == 0 && nbn == (daddr_t)-1) {
1368 			/*
1369 			 * this used to just set error, but that doesn't
1370 			 * do the right thing.  Instead, it causes random
1371 			 * memory errors.  The panic() should remain until this
1372 			 * condition can be handled without destabilizing the system.
1373 			 */
1374 #if 1
1375 			panic("sw_reg_strategy: swap to sparse file");
1376 #else
1377 			error = EIO;	/* failure */
1378 #endif
1379 		}
1380 
1381 		/*
1382 		 * punt if there was an error or a hole in the file.
1383 		 * we must wait for any i/o ops we have already started
1384 		 * to finish before returning.
1385 		 *
1386 		 * XXX we could deal with holes here but it would be
1387 		 * a hassle (in the write case).
1388 		 */
1389 		if (error) {
1390 			s = splbio();
1391 			vnx->vx_error = error;	/* pass error up */
1392 			goto out;
1393 		}
1394 
1395 		/*
1396 		 * compute the size ("sz") of this transfer (in bytes).
1397 		 * XXXCDC: ignores read-ahead for non-zero offset
1398 		 */
1399 		if ((off = (byteoff % sdp->swd_bsize)) != 0)
1400 			sz = sdp->swd_bsize - off;
1401 		else
1402 			sz = (1 + nra) * sdp->swd_bsize;
1403 
1404 		if (resid < sz)
1405 			sz = resid;
1406 
1407 		UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p offset 0x%x/0x%x",
1408 				sdp->swd_vp, vp, byteoff, nbn);
1409 
1410 		/*
1411 		 * now get a buf structure.   note that the vb_buf is
1412 		 * at the front of the nbp structure so that you can
1413 		 * cast pointers between the two structures easily.
1414 		 */
1415 		getvndbuf(nbp);
1416 		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
1417 		nbp->vb_buf.b_bcount   = sz;
1418 		nbp->vb_buf.b_bufsize  = sz;
1419 		nbp->vb_buf.b_error    = 0;
1420 		nbp->vb_buf.b_data     = addr;
1421 		nbp->vb_buf.b_blkno    = nbn + btodb(off);
1422 		nbp->vb_buf.b_proc     = bp->b_proc;
1423 		nbp->vb_buf.b_iodone   = sw_reg_iodone;
1424 		nbp->vb_buf.b_vp       = NULLVP;
1425 		nbp->vb_buf.b_vnbufs.le_next = NOLIST;
1426 		nbp->vb_buf.b_rcred    = sdp->swd_cred;
1427 		nbp->vb_buf.b_wcred    = sdp->swd_cred;
1428 		LIST_INIT(&nbp->vb_buf.b_dep);
1429 
1430 		/*
1431 		 * set b_dirtyoff/end and b_validoff/end.   this is
1432 		 * required by the NFS client code (otherwise it will
1433 		 * just discard our I/O request).
1434 		 */
1435 		if (bp->b_dirtyend == 0) {
1436 			nbp->vb_buf.b_dirtyoff = 0;
1437 			nbp->vb_buf.b_dirtyend = sz;
1438 		} else {
1439 			nbp->vb_buf.b_dirtyoff =
1440 			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
1441 			nbp->vb_buf.b_dirtyend =
1442 			    min(sz,
1443 				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
1444 		}
1445 		if (bp->b_validend == 0) {
1446 			nbp->vb_buf.b_validoff = 0;
1447 			nbp->vb_buf.b_validend = sz;
1448 		} else {
1449 			nbp->vb_buf.b_validoff =
1450 			    max(0, bp->b_validoff - (bp->b_bcount-resid));
1451 			nbp->vb_buf.b_validend =
1452 			    min(sz,
1453 				max(0, bp->b_validend - (bp->b_bcount-resid)));
1454 		}
1455 
1456 		nbp->vb_xfer = vnx;	/* patch it back in to vnx */
1457 
1458 		/*
1459 		 * Just sort by block number
1460 		 */
1461 		nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
1462 		s = splbio();
1463 		if (vnx->vx_error != 0) {
1464 			putvndbuf(nbp);
1465 			goto out;
1466 		}
1467 		vnx->vx_pending++;
1468 
1469 		/* assoc new buffer with underlying vnode */
1470 		bgetvp(vp, &nbp->vb_buf);
1471 
1472 		/* sort it in and start I/O if we are not over our limit */
1473 		disksort(&sdp->swd_tab, &nbp->vb_buf);
1474 		sw_reg_start(sdp);
1475 		splx(s);
1476 
1477 		/*
1478 		 * advance to the next I/O
1479 		 */
1480 		byteoff += sz;
1481 		addr += sz;
1482 	}
1483 
1484 	s = splbio();
1485 
1486 out: /* Arrive here at splbio */
1487 	vnx->vx_flags &= ~VX_BUSY;
1488 	if (vnx->vx_pending == 0) {
1489 		if (vnx->vx_error != 0) {
1490 			bp->b_error = vnx->vx_error;
1491 			bp->b_flags |= B_ERROR;
1492 		}
1493 		putvndxfer(vnx);
1494 		biodone(bp);
1495 	}
1496 	splx(s);
1497 }
1498 
1499 /*
1500  * sw_reg_start: start an I/O request on the requested swapdev
1501  *
1502  * => reqs are sorted by disksort (above)
1503  */
1504 static void
1505 sw_reg_start(sdp)
1506 	struct swapdev	*sdp;
1507 {
1508 	struct buf	*bp;
1509 	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);
1510 
1511 	/* recursion control */
1512 	if ((sdp->swd_flags & SWF_BUSY) != 0)
1513 		return;
1514 
1515 	sdp->swd_flags |= SWF_BUSY;
1516 
1517 	while (sdp->swd_tab.b_active < sdp->swd_maxactive) {
1518 		bp = sdp->swd_tab.b_actf;
1519 		if (bp == NULL)
1520 			break;
1521 		sdp->swd_tab.b_actf = bp->b_actf;
1522 		sdp->swd_tab.b_active++;
1523 
1524 		UVMHIST_LOG(pdhist,
1525 		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
1526 		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
1527 		if ((bp->b_flags & B_READ) == 0)
1528 			bp->b_vp->v_numoutput++;
1529 		VOP_STRATEGY(bp);
1530 	}
1531 	sdp->swd_flags &= ~SWF_BUSY;
1532 }
1533 
1534 /*
1535  * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1536  *
1537  * => note that we can recover the vndbuf struct by casting the buf ptr
1538  */
1539 static void
1540 sw_reg_iodone(bp)
1541 	struct buf *bp;
1542 {
1543 	struct vndbuf *vbp = (struct vndbuf *) bp;
1544 	struct vndxfer *vnx = vbp->vb_xfer;
1545 	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
1546 	struct swapdev	*sdp = vnx->vx_sdp;
1547 	int		s, resid;
1548 	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);
1549 
1550 	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
1551 	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
1552 	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
1553 	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
1554 
1555 	/*
1556 	 * protect vbp at splbio and update.
1557 	 */
1558 
1559 	s = splbio();
1560 	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1561 	pbp->b_resid -= resid;
1562 	vnx->vx_pending--;
1563 
1564 	if (vbp->vb_buf.b_error) {
1565 		UVMHIST_LOG(pdhist, "  got error=%d !",
1566 		    vbp->vb_buf.b_error, 0, 0, 0);
1567 
1568 		/* pass error upward */
1569 		vnx->vx_error = vbp->vb_buf.b_error;
1570 	}
1571 
1572 	/*
1573 	 * disassociate this buffer from the vnode (if any).
1574 	 */
1575 	if (vbp->vb_buf.b_vp != NULLVP) {
1576 		brelvp(&vbp->vb_buf);
1577 	}
1578 
1586 	/*
1587 	 * kill vbp structure
1588 	 */
1589 	putvndbuf(vbp);
1590 
1591 	/*
1592 	 * wrap up this transaction if it has run to completion or, in
1593 	 * case of an error, when all auxiliary buffers have returned.
1594 	 */
1595 	if (vnx->vx_error != 0) {
1596 		/* pass error upward */
1597 		pbp->b_flags |= B_ERROR;
1598 		pbp->b_error = vnx->vx_error;
1599 		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
1600 			putvndxfer(vnx);
1601 			biodone(pbp);
1602 		}
1603 	} else if (pbp->b_resid == 0) {
1604 #ifdef DIAGNOSTIC
1605 		if (vnx->vx_pending != 0)
1606 			panic("sw_reg_iodone: vnx pending: %d",vnx->vx_pending);
1607 #endif
1608 
1609 		if ((vnx->vx_flags & VX_BUSY) == 0) {
1610 			UVMHIST_LOG(pdhist, "  iodone, pbp=%p error=%d !",
1611 			    pbp, vnx->vx_error, 0, 0);
1612 			putvndxfer(vnx);
1613 			biodone(pbp);
1614 		}
1615 	}
1616 
1617 	/*
1618 	 * done!   start next swapdev I/O if one is pending
1619 	 */
1620 	sdp->swd_tab.b_active--;
1621 	sw_reg_start(sdp);
1622 	splx(s);
1623 }
1624 
1625 
1626 /*
1627  * uvm_swap_alloc: allocate space on swap
1628  *
1629  * => allocation is done "round robin" down the priority list; as we
1630  *	allocate in a priority we "rotate" the circle queue.
1631  * => space can be freed with uvm_swap_free
1632  * => we return the page slot number in /dev/drum (0 == invalid slot)
1633  * => we lock uvm.swap_data_lock
1634  * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1635  */
1636 int
1637 uvm_swap_alloc(nslots, lessok)
1638 	int *nslots;	/* IN/OUT */
1639 	boolean_t lessok;
1640 {
1641 	struct swapdev *sdp;
1642 	struct swappri *spp;
1643 	u_long	result;
1644 	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);
1645 
1646 	/*
1647 	 * no swap devices configured yet?   definite failure.
1648 	 */
1649 	if (uvmexp.nswapdev < 1)
1650 		return 0;
1651 
1652 	/*
1653 	 * lock data lock, convert slots into blocks, and enter loop
1654 	 */
1655 	simple_lock(&uvm.swap_data_lock);
1656 
1657 ReTry:	/* XXXMRG */
1658 	for (spp = LIST_FIRST(&swap_priority); spp != NULL;
1659 	     spp = LIST_NEXT(spp, spi_swappri)) {
1660 		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
1661 		     sdp != (void *)&spp->spi_swapdev;
1662 		     sdp = CIRCLEQ_NEXT(sdp,swd_next)) {
1663 			/* if it's not enabled, then we can't swap from it */
1664 			if ((sdp->swd_flags & SWF_ENABLE) == 0)
1665 				continue;
1666 			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1667 				continue;
1668 			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN, 0,
1669 					 EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
1670 					 &result) != 0) {
1671 				continue;
1672 			}
1673 
1674 			/*
1675 			 * successful allocation!  now rotate the circleq.
1676 			 */
1677 			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1678 			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1679 			sdp->swd_npginuse += *nslots;
1680 			uvmexp.swpginuse += *nslots;
1681 			simple_unlock(&uvm.swap_data_lock);
1682 			/* done!  return drum slot number */
1683 			UVMHIST_LOG(pdhist,
1684 			    "success!  returning %d slots starting at %d",
1685 			    *nslots, result + sdp->swd_drumoffset, 0, 0);
1686 			return(result + sdp->swd_drumoffset);
1687 		}
1688 	}
1689 
1690 	/* XXXMRG: BEGIN HACK */
1691 	if (*nslots > 1 && lessok) {
1692 		*nslots = 1;
1693 		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
1694 	}
1695 	/* XXXMRG: END HACK */
1696 
1697 	simple_unlock(&uvm.swap_data_lock);
1698 	return 0;		/* failed */
1699 }
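
/*
 * allocation order, sketched with hypothetical devices: with sd0 and
 * sd1 both enabled at priority 0 and sd2 at priority 10, requests are
 * always tried at priority 0 first; after a successful allocation from
 * sd0 the CIRCLEQ is rotated so the next request tries sd1, giving
 * round-robin use within a priority.  sd2 is only consulted once both
 * sd0 and sd1 are full (or disabled), and a failed multi-slot request
 * may be retried for a single slot when the caller passed lessok.
 */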
1700 
1701 /*
1702  * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1703  *
1704  * => we lock uvm.swap_data_lock
1705  */
1706 void
1707 uvm_swap_markbad(startslot, nslots)
1708 	int startslot;
1709 	int nslots;
1710 {
1711 	struct swapdev *sdp;
1712 	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);
1713 
1714 	simple_lock(&uvm.swap_data_lock);
1715 	sdp = swapdrum_getsdp(startslot);
1716 
1717 	/*
1718 	 * we just keep track of how many pages have been marked bad
1719 	 * in this device, to make everything add up in swap_off().
1720 	 * we assume here that the range of slots will all be within
1721 	 * one swap device.
1722 	 */
1723 	sdp->swd_npgbad += nslots;
1724 
1725 	simple_unlock(&uvm.swap_data_lock);
1726 }
1727 
1728 /*
1729  * uvm_swap_free: free swap slots
1730  *
1731  * => this can be all or part of an allocation made by uvm_swap_alloc
1732  * => we lock uvm.swap_data_lock
1733  */
1734 void
1735 uvm_swap_free(startslot, nslots)
1736 	int startslot;
1737 	int nslots;
1738 {
1739 	struct swapdev *sdp;
1740 	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);
1741 
1742 	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
1743 	    startslot, 0, 0);
1744 
1745 	/*
1746 	 * ignore attempts to free the "bad" slot.
1747 	 */
1748 	if (startslot == SWSLOT_BAD) {
1749 		return;
1750 	}
1751 
1752 	/*
1753 	 * convert drum slot offset back to sdp, free the blocks
1754 	 * in the extent, and return.   must hold pri lock to do
1755 	 * lookup and access the extent.
1756 	 */
1757 	simple_lock(&uvm.swap_data_lock);
1758 	sdp = swapdrum_getsdp(startslot);
1759 
1760 #ifdef DIAGNOSTIC
1761 	if (uvmexp.nswapdev < 1)
1762 		panic("uvm_swap_free: uvmexp.nswapdev < 1\n");
1763 	if (sdp == NULL) {
1764 		printf("uvm_swap_free: startslot %d, nslots %d\n", startslot,
1765 		    nslots);
1766 		panic("uvm_swap_free: unmapped address\n");
1767 	}
1768 #endif
1769 	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
1770 			EX_MALLOCOK|EX_NOWAIT) != 0) {
1771 		printf("warning: resource shortage: %d pages of swap lost\n",
1772 			nslots);
1773 	}
1774 
1775 	sdp->swd_npginuse -= nslots;
1776 	uvmexp.swpginuse -= nslots;
1777 #ifdef DIAGNOSTIC
1778 	if (sdp->swd_npginuse < 0)
1779 		panic("uvm_swap_free: inuse < 0");
1780 #endif
1781 #ifdef UVM_SWAP_ENCRYPT
1782 	{
1783 		int i;
1784 		if (swap_encrypt_initalized) {
1785 			/* Dereference keys */
1786 			for (i = 0; i < nslots; i++)
1787 				if (uvm_swap_needdecrypt(sdp, startslot + i))
1788 					SWAP_KEY_PUT(sdp, SWD_KEY(sdp, startslot + i));
1789 
1790 			/* Mark range as not decrypt */
1791 			uvm_swap_markdecrypt(sdp, startslot, nslots, 0);
1792 		}
1793 	}
1794 #endif /* UVM_SWAP_ENCRYPT */
1795 	simple_unlock(&uvm.swap_data_lock);
1796 }
1797 
1798 /*
1799  * uvm_swap_put: put any number of pages into a contig place on swap
1800  *
1801  * => can be sync or async
1802  * => XXXMRG: consider making it an inline or macro
1803  */
1804 int
1805 uvm_swap_put(swslot, ppsp, npages, flags)
1806 	int swslot;
1807 	struct vm_page **ppsp;
1808 	int	npages;
1809 	int	flags;
1810 {
1811 	int	result;
1812 
1813 	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
1814 	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1815 
1816 	return (result);
1817 }
1818 
1819 /*
1820  * uvm_swap_get: get a single page from swap
1821  *
1822  * => usually a sync op (from fault)
1823  * => XXXMRG: consider making it an inline or macro
1824  */
1825 int
1826 uvm_swap_get(page, swslot, flags)
1827 	struct vm_page *page;
1828 	int swslot, flags;
1829 {
1830 	int	result;
1831 
1832 	uvmexp.nswget++;
1833 #ifdef DIAGNOSTIC
1834 	if ((flags & PGO_SYNCIO) == 0)
1835 		printf("uvm_swap_get: ASYNC get requested?\n");
1836 #endif
1837 
1838 	if (swslot == SWSLOT_BAD) {
1839 		return VM_PAGER_ERROR;
1840 	}
1841 
1842 	/*
1843 	 * this page is (about to be) no longer only in swap.
1844 	 */
1845 	simple_lock(&uvm.swap_data_lock);
1846 	uvmexp.swpgonly--;
1847 	simple_unlock(&uvm.swap_data_lock);
1848 
1849 	result = uvm_swap_io(&page, swslot, 1, B_READ |
1850 	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1851 
1852 	if (result != VM_PAGER_OK && result != VM_PAGER_PEND) {
1853 		/*
1854 		 * oops, the read failed so it really is still only in swap.
1855 		 */
1856 		simple_lock(&uvm.swap_data_lock);
1857 		uvmexp.swpgonly++;
1858 		simple_unlock(&uvm.swap_data_lock);
1859 	}
1860 
1861 	return (result);
1862 }
1863 
1864 /*
1865  * uvm_swap_io: do an i/o operation to swap
1866  */
1867 
1868 static int
1869 uvm_swap_io(pps, startslot, npages, flags)
1870 	struct vm_page **pps;
1871 	int startslot, npages, flags;
1872 {
1873 	daddr_t startblk;
1874 	struct swapbuf *sbp;
1875 	struct	buf *bp;
1876 	vaddr_t kva;
1877 	int	result, s, mapinflags, pflag;
1878 #ifdef UVM_SWAP_ENCRYPT
1879 	vaddr_t dstkva;
1880 	struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
1881 	struct swapdev *sdp;
1882 	int	encrypt = 0;
1883 #endif
1884 	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);
1885 
1886 	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
1887 	    startslot, npages, flags, 0);
1888 
1889 	/*
1890 	 * convert starting drum slot to block number
1891 	 */
1892 	startblk = btodb(startslot << PAGE_SHIFT);
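	/*
	 * (e.g. with 4 KB pages and 512-byte DEV_BSIZE blocks each slot
	 * spans 8 disk blocks, so slot 3 becomes startblk 24.)
	 */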
1893 
1894 	/*
1895 	 * first, map the pages into the kernel (XXX: currently required
1896 	 * by buffer system).   note that we don't let pagermapin alloc
1897 	 * an aiodesc structure because we don't want to chance a malloc.
1898 	 * we've got our own pool of aiodesc structures (in swapbuf).
1899 	 */
1900 	mapinflags = (flags & B_READ) ? UVMPAGER_MAPIN_READ :
1901 	    UVMPAGER_MAPIN_WRITE;
1902 	if ((flags & B_ASYNC) == 0)
1903 		mapinflags |= UVMPAGER_MAPIN_WAITOK;
1904 	kva = uvm_pagermapin(pps, npages, NULL, mapinflags);
1905 	if (kva == 0)
1906 		return (VM_PAGER_AGAIN);
1907 
1908 #ifdef UVM_SWAP_ENCRYPT
1909 	if ((flags & B_READ) == 0) {
1910 		/*
1911 		 * Check if we need to do swap encryption on old pages.
1912 		 * Later we need a different scheme that swap-encrypts
1913 		 * all pages of a process that had at least one page
1914 		 * swap-encrypted.  Then we might not need to copy all pages
1915 		 * in the cluster, and could avoid the memory overhead of
1916 		 * swapping.
1917 		 */
1918 		if (uvm_doswapencrypt)
1919 			encrypt = 1;
1920 	}
1921 
1922 	if (swap_encrypt_initalized || encrypt) {
1923 		/*
1924 		 * we need to know the swap device that we are swapping to/from
1925 		 * to see if the pages need to be marked for decryption or
1926 		 * actually need to be decrypted.
1927 		 * XXX - does this information stay the same over the whole
1928 		 * execution of this function?
1929 		 */
1930 		simple_lock(&uvm.swap_data_lock);
1931 		sdp = swapdrum_getsdp(startslot);
1932 		simple_unlock(&uvm.swap_data_lock);
1933 	}
1934 
1935 	/*
1936 	 * encrypt to swap
1937 	 */
1938 	if ((flags & B_READ) == 0 && encrypt) {
1939 		int i, opages;
1940 		caddr_t src, dst;
1941 		struct swap_key *key;
1942 		u_int64_t block;
1943 		int swmapflags;
1944 
1945 		/* we need the pages mapped writable; MAPIN_READ does that */
1946 		swmapflags = UVMPAGER_MAPIN_READ;
1947 		if ((flags & B_ASYNC) == 0)
1948 			swmapflags |= UVMPAGER_MAPIN_WAITOK;
1949 
1950 		if (!uvm_swap_allocpages(tpps, npages)) {
1951 			uvm_pagermapout(kva, npages);
1952 			return (VM_PAGER_AGAIN);
1953 		}
1954 
1955 		dstkva = uvm_pagermapin(tpps, npages, NULL, swmapflags);
1956 		if (dstkva == 0) {
1957 			uvm_pagermapout(kva, npages);
1958 			uvm_swap_freepages(tpps, npages);
1959 			return (VM_PAGER_AGAIN);
1960 		}
1961 
1962 		src = (caddr_t) kva;
1963 		dst = (caddr_t) dstkva;
1964 		block = startblk;
1965 		for (i = 0; i < npages; i++) {
1966 			key = SWD_KEY(sdp, startslot + i);
1967 			SWAP_KEY_GET(sdp, key);	/* add reference */
1968 
1969 			/* mark for async writes */
1970 			tpps[i]->pqflags |= PQ_ENCRYPT;
1971 			swap_encrypt(key, src, dst, block, 1 << PAGE_SHIFT);
1972 			src += 1 << PAGE_SHIFT;
1973 			dst += 1 << PAGE_SHIFT;
1974 			block += btodb(1 << PAGE_SHIFT);
1975 		}
1976 
1977 		uvm_pagermapout(kva, npages);
1978 
1979 		/* dispose of the pages we don't need anymore */
1980 		opages = npages;
1981 		uvm_pager_dropcluster(NULL, NULL, pps, &opages,
1982 				      PGO_PDFREECLUST);
1983 
1984 		kva = dstkva;
1985 	}
1986 #endif /* UVM_SWAP_ENCRYPT */
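
	/*
	 * (recap of the encrypt path above: the caller's pages were
	 * encrypted into freshly allocated shadow pages (tpps), the
	 * originals were dropped back via uvm_pager_dropcluster(), and
	 * "kva" now maps the shadow pages; those are freed again once
	 * the write is done -- below for sync i/o, in uvm_swap_aiodone()
	 * for async.)
	 */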
1987 
1988 	/*
1989 	 * now allocate a swap buffer off of freesbufs
1990 	 * [make sure we don't put the pagedaemon to sleep...]
1991 	 */
1992 	s = splbio();
1993 	pflag = ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc)
1994 		? 0
1995 		: PR_WAITOK;
1996 	sbp = pool_get(&swapbuf_pool, pflag);
1997 	splx(s);		/* drop splbio */
1998 
1999 	/*
2000 	 * if we failed to get a swapbuf, return "try again"
2001 	 */
2002 	if (sbp == NULL) {
2003 #ifdef UVM_SWAP_ENCRYPT
2004 		if ((flags & B_READ) == 0 && encrypt) {
2005 			int i;
2006 
2007 			/* swap encrypt needs cleanup */
2008 			for (i = 0; i < npages; i++)
2009 				SWAP_KEY_PUT(sdp, SWD_KEY(sdp, startslot + i));
2010 
2011 			uvm_pagermapout(kva, npages);
2012 			uvm_swap_freepages(tpps, npages);
2013 		}
2014 #endif
2015 		return (VM_PAGER_AGAIN);
2016 	}
2017 
2018 #ifdef UVM_SWAP_ENCRYPT
2019 	/*
2020 	 * prevent ASYNC reads.
2021 	 * for reads, uvm_swap_io is only called from uvm_swap_get, which
2022 	 * assumes that all gets are SYNCIO.  Just make sure of that here.
2023 	 */
2024 	if (flags & B_READ)
2025 		flags &= ~B_ASYNC;
2026 #endif
2027 	/*
2028 	 * fill in the bp/sbp.   we currently route our i/o through
2029 	 * /dev/drum's vnode [swapdev_vp].
2030 	 */
2031 	bp = &sbp->sw_buf;
2032 	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
2033 	bp->b_proc = &proc0;	/* XXX */
2034 	bp->b_rcred = bp->b_wcred = proc0.p_ucred;
2035 	bp->b_vnbufs.le_next = NOLIST;
2036 	bp->b_data = (caddr_t)kva;
2037 	bp->b_blkno = startblk;
2038 	LIST_INIT(&bp->b_dep);
2039 	s = splbio();
2040 	bp->b_vp = NULL;
2041 	buf_replacevnode(bp, swapdev_vp);
2042 	splx(s);
2043 	bp->b_bcount = npages << PAGE_SHIFT;
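	/* (b_blkno is in DEV_BSIZE units, b_bcount is in bytes) */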
2044 
2045 	/*
2046 	 * for pageouts we must set "dirtyoff" [NFS client code needs it],
2047 	 * and we bump v_numoutput (the count of active outputs).
2048 	 */
2049 	if ((bp->b_flags & B_READ) == 0) {
2050 		bp->b_dirtyoff = 0;
2051 		bp->b_dirtyend = npages << PAGE_SHIFT;
2052 #ifdef UVM_SWAP_ENCRYPT
2053 		/* mark the pages in the drum for decryption */
2054 		if (swap_encrypt_initalized)
2055 			uvm_swap_markdecrypt(sdp, startslot, npages, encrypt);
2056 #endif
2057 		s = splbio();
2058 		swapdev_vp->v_numoutput++;
2059 		splx(s);
2060 	}
2061 
2062 	/*
2063 	 * for async ops we must set up the aiodesc and set up the callback.
2064 	 * XXX: we expect no async reads, but we don't prevent them here.
2065 	 */
2066 	if (flags & B_ASYNC) {
2067 		sbp->sw_aio.aiodone = uvm_swap_aiodone;
2068 		sbp->sw_aio.kva = kva;
2069 		sbp->sw_aio.npages = npages;
2070 		sbp->sw_aio.pd_ptr = sbp;	/* backpointer */
2071 		bp->b_flags |= B_CALL;		/* set callback */
2072 		bp->b_iodone = uvm_swap_bufdone;/* "buf" iodone function */
2073 		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
2074 	}
2075 	UVMHIST_LOG(pdhist,
2076 	    "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld",
2077 	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);
2078 
2079 	/*
2080 	 * now we start the I/O, and if async, return.
2081 	 */
2082 	VOP_STRATEGY(bp);
2083 	if (flags & B_ASYNC)
2084 		return (VM_PAGER_PEND);
2085 
2086 	/*
2087 	 * must be sync i/o.   wait for it to finish
2088 	 */
2089 	bp->b_error = biowait(bp);
2090 	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;
2091 
2092 #ifdef UVM_SWAP_ENCRYPT
2093 	/*
2094 	 * decrypt swap
2095 	 */
2096 	if (swap_encrypt_initalized &&
2097 	    (bp->b_flags & B_READ) && !(bp->b_flags & B_ERROR)) {
2098 		int i;
2099 		caddr_t data = bp->b_data;
2100 		u_int64_t block = startblk;
2101 		struct swap_key *key = NULL;
2102 
2103 		for (i = 0; i < npages; i++) {
2104 			/* Check if we need to decrypt */
2105 			if (uvm_swap_needdecrypt(sdp, startslot + i)) {
2106 				key = SWD_KEY(sdp, startslot + i);
2107 				swap_decrypt(key, data, data, block,
2108 					     1 << PAGE_SHIFT);
2109 			}
2110 			data += 1 << PAGE_SHIFT;
2111 			block += btodb(1 << PAGE_SHIFT);
2112 		}
2113 	}
2114 #endif
2115 	/*
2116 	 * kill the pager mapping
2117 	 */
2118 	uvm_pagermapout(kva, npages);
2119 
2120 #ifdef UVM_SWAP_ENCRYPT
2121 	/*
2122 	 * the encrypted shadow pages are no longer needed; free them
2123 	 */
2124 	if ((bp->b_flags & B_READ) == 0 && encrypt)
2125 		uvm_swap_freepages(tpps, npages);
2126 #endif
2127 	/*
2128 	 * now dispose of the swap buffer
2129 	 */
2130 	s = splbio();
2131 	if (bp->b_vp)
2132 		brelvp(bp);
2133 
2134 	pool_put(&swapbuf_pool, sbp);
2135 	splx(s);
2136 
2137 	/*
2138 	 * finally return.
2139 	 */
2140 	UVMHIST_LOG(pdhist, "<- done (sync)  result=%d", result, 0, 0, 0);
2141 	return (result);
2142 }
2143 
2144 /*
2145  * uvm_swap_bufdone: called from the buffer system when the i/o is done
2146  */
2147 static void
2148 uvm_swap_bufdone(bp)
2149 	struct buf *bp;
2150 {
2151 	struct swapbuf *sbp = (struct swapbuf *) bp;
2152 	int	s = splbio();
2153 	UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist);
2154 
2155 	UVMHIST_LOG(pdhist, "cleaning buf %p", bp, 0, 0, 0);
2156 #ifdef DIAGNOSTIC
2157 	/*
2158 	 * sanity check: swapbufs are private, so they shouldn't be wanted
2159 	 */
2160 	if (bp->b_flags & B_WANTED)
2161 		panic("uvm_swap_bufdone: private buf wanted");
2162 #endif
2163 
2164 	/*
2165 	 * drop the buffer's reference to the vnode.
2166 	 */
2167 	if (bp->b_vp)
2168 		brelvp(bp);
2169 
2170 	/*
2171 	 * now put the aio on the uvm.aio_done list and wake the
2172 	 * pagedaemon (which will finish up our job in its context).
2173 	 */
2174 	simple_lock(&uvm.pagedaemon_lock);	/* locks uvm.aio_done */
2175 	TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq);
2176 	simple_unlock(&uvm.pagedaemon_lock);
2177 
2178 	wakeup(&uvm.pagedaemon);
2179 	splx(s);
2180 }
2181 
2182 /*
2183  * uvm_swap_aiodone: aiodone function for anonymous memory
2184  *
2185  * => this is called in the context of the pagedaemon (but with the
2186  *	page queues unlocked!)
2187  * => our "aio" structure must be part of a "swapbuf"
2188  */
2189 static void
2190 uvm_swap_aiodone(aio)
2191 	struct uvm_aiodesc *aio;
2192 {
2193 	struct swapbuf *sbp = aio->pd_ptr;
2194 	struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT];
2195 	int lcv, s;
2196 	vaddr_t addr;
2197 	UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist);
2198 
2199 	UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0);
2200 #ifdef DIAGNOSTIC
2201 	/*
2202 	 * sanity check
2203 	 */
2204 	if (aio->npages > (MAXBSIZE >> PAGE_SHIFT))
2205 		panic("uvm_swap_aiodone: aio too big!");
2206 #endif
2207 
2208 	/*
2209 	 * first, we have to recover the page pointers (pps) by poking in the
2210 	 * kernel pmap (XXX: should be saved in the buf structure).
2211 	 */
2212 	for (addr = aio->kva, lcv = 0 ; lcv < aio->npages ;
2213 		addr += PAGE_SIZE, lcv++) {
2214 		pps[lcv] = uvm_pageratop(addr);
2215 	}
2216 
2217 	/*
2218 	 * now we can dispose of the kernel mappings of the buffer
2219 	 */
2220 	uvm_pagermapout(aio->kva, aio->npages);
2221 
2222 	/*
2223 	 * now we can dispose of the pages by using the dropcluster function
2224 	 * [note that we have no "page of interest" so we pass in null]
2225 	 */
2226 
2227 #ifdef UVM_SWAP_ENCRYPT
2228 	/*
2229 	 * XXX - assumes that we only get ASYNC writes; this used to be done above.
2230 	 */
2231 	if (pps[0]->pqflags & PQ_ENCRYPT)
2232 		uvm_swap_freepages(pps, aio->npages);
2233 	else
2234 #endif /* UVM_SWAP_ENCRYPT */
2235 	uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages,
2236 			      PGO_PDFREECLUST);
2237 
2238 	/*
2239 	 * finally, we can dispose of the swapbuf
2240 	 */
2241 	s = splbio();
2242 	pool_put(&swapbuf_pool, sbp);
2243 	splx(s);
2244 
2245 	/*
2246 	 * done!
2247 	 */
2248 }
2249 
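/*
 * swapmount: at boot, enable the primary swap device (swdevt[0]) as the
 * initial swap area by faking up a "swap_device" entry and turning it
 * on with swap_on().
 */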
2250 static void
2251 swapmount()
2252 {
2253 	struct swapdev *sdp;
2254 	struct swappri *spp;
2255 	struct vnode *vp;
2256 	dev_t swap_dev = swdevt[0].sw_dev;
2257 
2258 	/*
2259 	 * No locking here since we happen to know that we will just be called
2260 	 * once before any other process has forked.
2261 	 */
2262 
2263 	if (swap_dev == NODEV) {
2264 		printf("swapmount: no device\n");
2265 		return;
2266 	}
2267 
2268 	if (bdevvp(swap_dev, &vp)) {
2269 		printf("swapmount: cannot get vnode for swap device\n");
2270 		return;
2271 	}
2272 
2273 	sdp = malloc(sizeof(*sdp), M_VMSWAP, M_WAITOK);
2274 	spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK);
2275 	memset(sdp, 0, sizeof(*sdp));
2276 
2277 	sdp->swd_flags = SWF_FAKE;
2278 	sdp->swd_dev = swap_dev;
2279 	sdp->swd_vp = vp;
2280 	swaplist_insert(sdp, spp, 0);
2281 	sdp->swd_pathlen = strlen("swap_device") + 1;
2282 	sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
2283 	if (copystr("swap_device", sdp->swd_path, sdp->swd_pathlen, 0))
2284 		panic("swapmount: copystr");
2285 
2286 	if (swap_on(curproc, sdp)) {
2287 		swaplist_find(vp, 1);
2288 		swaplist_trim();
2289 		vput(sdp->swd_vp);
2290 		free(sdp->swd_path, M_VMSWAP);
2291 		free(sdp, M_VMSWAP);
2292 		return;
2293 	}
2294 
2295 	VOP_UNLOCK(vp, 0, curproc);
2296 }
2297