xref: /netbsd-src/sys/uvm/uvm_swap.c (revision aef5eb5f59cdfe8314f1b5f78ac04eb144e44010)
1 /*	$NetBSD: uvm_swap.c,v 1.206 2021/08/23 13:08:18 hannken Exp $	*/
2 
3 /*
4  * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
29  * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.206 2021/08/23 13:08:18 hannken Exp $");
34 
35 #include "opt_uvmhist.h"
36 #include "opt_compat_netbsd.h"
37 #include "opt_ddb.h"
38 #include "opt_vmswap.h"
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/atomic.h>
43 #include <sys/buf.h>
44 #include <sys/bufq.h>
45 #include <sys/conf.h>
46 #include <sys/cprng.h>
47 #include <sys/proc.h>
48 #include <sys/namei.h>
49 #include <sys/disklabel.h>
50 #include <sys/errno.h>
51 #include <sys/kernel.h>
52 #include <sys/vnode.h>
53 #include <sys/file.h>
54 #include <sys/vmem.h>
55 #include <sys/blist.h>
56 #include <sys/mount.h>
57 #include <sys/pool.h>
58 #include <sys/kmem.h>
59 #include <sys/syscallargs.h>
60 #include <sys/swap.h>
61 #include <sys/kauth.h>
62 #include <sys/sysctl.h>
63 #include <sys/workqueue.h>
64 
65 #include <uvm/uvm.h>
66 
67 #include <miscfs/specfs/specdev.h>
68 
69 #include <crypto/aes/aes.h>
70 #include <crypto/aes/aes_cbc.h>
71 
72 /*
73  * uvm_swap.c: manage configuration and i/o to swap space.
74  */
75 
76 /*
77  * swap space is managed in the following way:
78  *
79  * each swap partition or file is described by a "swapdev" structure.
80  * each "swapdev" structure contains a "swapent" structure which contains
81  * information that is passed up to the user (via system calls).
82  *
83  * each swap partition is assigned a "priority" (int) which controls
84  * swap partition usage.
85  *
86  * the system maintains a global data structure describing all swap
87  * partitions/files.   there is a sorted LIST of "swappri" structures
88  * which describe "swapdev"'s at that priority.   this LIST is headed
89  * by the "swap_priority" global var.    each "swappri" contains a
90  * TAILQ of "swapdev" structures at that priority.
91  *
92  * locking:
93  *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
94  *    system call and prevents the swap priority list from changing
95  *    while we are in the middle of a system call (e.g. SWAP_STATS).
96  *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
97  *    structures including the priority list, the swapdev structures,
98  *    and the swapmap arena.
99  *
100  * each swap device has the following info:
101  *  - swap device in use (could be disabled, preventing future use)
102  *  - swap enabled (allows new allocations on swap)
103  *  - map info in /dev/drum
104  *  - vnode pointer
105  * for swap files only:
106  *  - block size
107  *  - max byte count in buffer
108  *  - buffer
109  *
110  * userland controls and configures swap with the swapctl(2) system call.
111  * the sys_swapctl performs the following operations:
112  *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
113  *  [2] SWAP_STATS: given a pointer to an array of swapent structures
114  *	(passed in via "arg") of a size passed in via "misc" ... we load
115  *	the current swap config into the array. The actual work is done
116  *	in the uvm_swap_stats() function.
117  *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
118  *	priority in "misc", start swapping on it.
119  *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
120  *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
121  *	"misc")
122  */
123 
124 /*
125  * swapdev: describes a single swap partition/file
126  *
127  * note the following should be true:
128  * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
129  * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
130  */
131 struct swapdev {
132 	dev_t			swd_dev;	/* device id */
133 	int			swd_flags;	/* flags:inuse/enable/fake */
134 	int			swd_priority;	/* our priority */
135 	int			swd_nblks;	/* blocks in this device */
136 	char			*swd_path;	/* saved pathname of device */
137 	int			swd_pathlen;	/* length of pathname */
138 	int			swd_npages;	/* #pages we can use */
139 	int			swd_npginuse;	/* #pages in use */
140 	int			swd_npgbad;	/* #pages bad */
141 	int			swd_drumoffset;	/* page0 offset in drum */
142 	int			swd_drumsize;	/* #pages in drum */
143 	blist_t			swd_blist;	/* blist for this swapdev */
144 	struct vnode		*swd_vp;	/* backing vnode */
145 	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */
146 
147 	int			swd_bsize;	/* blocksize (bytes) */
148 	int			swd_maxactive;	/* max active i/o reqs */
149 	struct bufq_state	*swd_tab;	/* buffer list */
150 	int			swd_active;	/* number of active buffers */
151 
152 	volatile uint32_t	*swd_encmap;	/* bitmap of encrypted slots */
153 	struct aesenc		swd_enckey;	/* AES key expanded for enc */
154 	struct aesdec		swd_deckey;	/* AES key expanded for dec */
155 	bool			swd_encinit;	/* true if keys initialized */
156 };
157 
158 /*
159  * swap device priority entry; the list is kept sorted on `spi_priority'.
160  */
161 struct swappri {
162 	int			spi_priority;     /* priority */
163 	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
164 	/* tailq of swapdevs at this priority */
165 	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
166 };
167 
168 /*
169  * The following two structures are used to keep track of data transfers
170  * on swap devices associated with regular files.
171  * NOTE: this code is more or less a copy of vnd.c; we use the same
172  * structure names here to ease porting..
173  */
174 struct vndxfer {
175 	struct buf	*vx_bp;		/* Pointer to parent buffer */
176 	struct swapdev	*vx_sdp;
177 	int		vx_error;
178 	int		vx_pending;	/* # of pending aux buffers */
179 	int		vx_flags;
180 #define VX_BUSY		1
181 #define VX_DEAD		2
182 };
183 
184 struct vndbuf {
185 	struct buf	vb_buf;
186 	struct vndxfer	*vb_xfer;
187 };
188 
189 /*
190  * We keep a of pool vndbuf's and vndxfer structures.
191  */
192 static struct pool vndxfer_pool, vndbuf_pool;
193 
194 /*
195  * local variables
196  */
197 static vmem_t *swapmap;	/* controls the mapping of /dev/drum */
198 
199 /* list of all active swap devices [by priority] */
200 LIST_HEAD(swap_priority, swappri);
201 static struct swap_priority swap_priority;
202 
203 /* locks */
204 static kmutex_t uvm_swap_data_lock __cacheline_aligned;
205 static krwlock_t swap_syscall_lock;
206 bool uvm_swap_init_done = false;
207 
208 /* workqueue and use counter for swap to regular files */
209 static int sw_reg_count = 0;
210 static struct workqueue *sw_reg_workqueue;
211 
212 /* tuneables */
213 u_int uvm_swapisfull_factor = 99;
214 #if VMSWAP_DEFAULT_PLAINTEXT
215 bool uvm_swap_encrypt = false;
216 #else
217 bool uvm_swap_encrypt = true;
218 #endif
219 
220 /*
221  * prototypes
222  */
223 static struct swapdev	*swapdrum_getsdp(int);
224 
225 static struct swapdev	*swaplist_find(struct vnode *, bool);
226 static void		 swaplist_insert(struct swapdev *,
227 					 struct swappri *, int);
228 static void		 swaplist_trim(void);
229 
230 static int swap_on(struct lwp *, struct swapdev *);
231 static int swap_off(struct lwp *, struct swapdev *);
232 
233 static void sw_reg_strategy(struct swapdev *, struct buf *, int);
234 static void sw_reg_biodone(struct buf *);
235 static void sw_reg_iodone(struct work *wk, void *dummy);
236 static void sw_reg_start(struct swapdev *);
237 
238 static int uvm_swap_io(struct vm_page **, int, int, int);
239 
240 static void uvm_swap_genkey(struct swapdev *);
241 static void uvm_swap_encryptpage(struct swapdev *, void *, int);
242 static void uvm_swap_decryptpage(struct swapdev *, void *, int);
243 
244 static size_t
245 encmap_size(size_t npages)
246 {
247 	struct swapdev *sdp;
248 	const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
249 	const size_t bitsperword = NBBY * bytesperword;
250 	const size_t nbits = npages; /* one bit for each page */
251 	const size_t nwords = howmany(nbits, bitsperword);
252 	const size_t nbytes = nwords * bytesperword;
253 
254 	return nbytes;
255 }
256 
257 /*
258  * uvm_swap_init: init the swap system data structures and locks
259  *
260  * => called at boot time from init_main.c after the filesystems
261  *	are brought up (which happens after uvm_init())
262  */
263 void
264 uvm_swap_init(void)
265 {
266 	UVMHIST_FUNC(__func__);
267 
268 	UVMHIST_CALLED(pdhist);
269 	/*
270 	 * first, init the swap list, its counter, and its lock.
271 	 * then get a handle on the vnode for /dev/drum by using
272 	 * the its dev_t number ("swapdev", from MD conf.c).
273 	 */
274 
275 	LIST_INIT(&swap_priority);
276 	uvmexp.nswapdev = 0;
277 	rw_init(&swap_syscall_lock);
278 	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
279 
280 	if (bdevvp(swapdev, &swapdev_vp))
281 		panic("%s: can't get vnode for swap device", __func__);
282 	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
283 		panic("%s: can't lock swap device", __func__);
284 	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
285 		panic("%s: can't open swap device", __func__);
286 	VOP_UNLOCK(swapdev_vp);
287 
288 	/*
289 	 * create swap block resource map to map /dev/drum.   the range
290 	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
291 	 * that block 0 is reserved (used to indicate an allocation
292 	 * failure, or no allocation).
293 	 */
294 	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
295 	    VM_NOSLEEP, IPL_NONE);
296 	if (swapmap == 0) {
297 		panic("%s: vmem_create failed", __func__);
298 	}
299 
300 	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
301 	    NULL, IPL_BIO);
302 	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
303 	    NULL, IPL_BIO);
304 
305 	uvm_swap_init_done = true;
306 
307 	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
308 }
309 
310 /*
311  * swaplist functions: functions that operate on the list of swap
312  * devices on the system.
313  */
314 
315 /*
316  * swaplist_insert: insert swap device "sdp" into the global list
317  *
318  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
319  * => caller must provide a newly allocated swappri structure (we will
320  *	FREE it if we don't need it... this it to prevent allocation
321  *	blocking here while adding swap)
322  */
323 static void
324 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
325 {
326 	struct swappri *spp, *pspp;
327 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
328 
329 	KASSERT(rw_write_held(&swap_syscall_lock));
330 	KASSERT(mutex_owned(&uvm_swap_data_lock));
331 
332 	/*
333 	 * find entry at or after which to insert the new device.
334 	 */
335 	pspp = NULL;
336 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
337 		if (priority <= spp->spi_priority)
338 			break;
339 		pspp = spp;
340 	}
341 
342 	/*
343 	 * new priority?
344 	 */
345 	if (spp == NULL || spp->spi_priority != priority) {
346 		spp = newspp;  /* use newspp! */
347 		UVMHIST_LOG(pdhist, "created new swappri = %jd",
348 			    priority, 0, 0, 0);
349 
350 		spp->spi_priority = priority;
351 		TAILQ_INIT(&spp->spi_swapdev);
352 
353 		if (pspp)
354 			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
355 		else
356 			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
357 	} else {
358 	  	/* we don't need a new priority structure, free it */
359 		kmem_free(newspp, sizeof(*newspp));
360 	}
361 
362 	/*
363 	 * priority found (or created).   now insert on the priority's
364 	 * tailq list and bump the total number of swapdevs.
365 	 */
366 	sdp->swd_priority = priority;
367 	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
368 	uvmexp.nswapdev++;
369 }
370 
371 /*
372  * swaplist_find: find and optionally remove a swap device from the
373  *	global list.
374  *
375  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
376  * => we return the swapdev we found (and removed)
377  */
378 static struct swapdev *
379 swaplist_find(struct vnode *vp, bool remove)
380 {
381 	struct swapdev *sdp;
382 	struct swappri *spp;
383 
384 	KASSERT(rw_lock_held(&swap_syscall_lock));
385 	KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1);
386 	KASSERT(mutex_owned(&uvm_swap_data_lock));
387 
388 	/*
389 	 * search the lists for the requested vp
390 	 */
391 
392 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
393 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
394 			if (sdp->swd_vp == vp) {
395 				if (remove) {
396 					TAILQ_REMOVE(&spp->spi_swapdev,
397 					    sdp, swd_next);
398 					uvmexp.nswapdev--;
399 				}
400 				return(sdp);
401 			}
402 		}
403 	}
404 	return (NULL);
405 }
406 
407 /*
408  * swaplist_trim: scan priority list for empty priority entries and kill
409  *	them.
410  *
411  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
412  */
413 static void
414 swaplist_trim(void)
415 {
416 	struct swappri *spp, *nextspp;
417 
418 	KASSERT(rw_write_held(&swap_syscall_lock));
419 	KASSERT(mutex_owned(&uvm_swap_data_lock));
420 
421 	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
422 		if (!TAILQ_EMPTY(&spp->spi_swapdev))
423 			continue;
424 		LIST_REMOVE(spp, spi_swappri);
425 		kmem_free(spp, sizeof(*spp));
426 	}
427 }
428 
429 /*
430  * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
431  *	to the "swapdev" that maps that section of the drum.
432  *
433  * => each swapdev takes one big contig chunk of the drum
434  * => caller must hold uvm_swap_data_lock
435  */
436 static struct swapdev *
437 swapdrum_getsdp(int pgno)
438 {
439 	struct swapdev *sdp;
440 	struct swappri *spp;
441 
442 	KASSERT(mutex_owned(&uvm_swap_data_lock));
443 
444 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
445 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
446 			if (sdp->swd_flags & SWF_FAKE)
447 				continue;
448 			if (pgno >= sdp->swd_drumoffset &&
449 			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
450 				return sdp;
451 			}
452 		}
453 	}
454 	return NULL;
455 }
456 
457 /*
458  * swapdrum_sdp_is: true iff the swap device for pgno is sdp
459  *
460  * => for use in positive assertions only; result is not stable
461  */
462 static bool __debugused
463 swapdrum_sdp_is(int pgno, struct swapdev *sdp)
464 {
465 	bool result;
466 
467 	mutex_enter(&uvm_swap_data_lock);
468 	result = swapdrum_getsdp(pgno) == sdp;
469 	mutex_exit(&uvm_swap_data_lock);
470 
471 	return result;
472 }
473 
474 void swapsys_lock(krw_t op)
475 {
476 	rw_enter(&swap_syscall_lock, op);
477 }
478 
479 void swapsys_unlock(void)
480 {
481 	rw_exit(&swap_syscall_lock);
482 }
483 
484 static void
485 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
486 {
487 	se->se_dev = sdp->swd_dev;
488 	se->se_flags = sdp->swd_flags;
489 	se->se_nblks = sdp->swd_nblks;
490 	se->se_inuse = inuse;
491 	se->se_priority = sdp->swd_priority;
492 	KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
493 	strcpy(se->se_path, sdp->swd_path);
494 }
495 
496 int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
497     (void *)enosys;
498 int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
499     (void *)enosys;
500 
501 /*
502  * sys_swapctl: main entry point for swapctl(2) system call
503  * 	[with two helper functions: swap_on and swap_off]
504  */
505 int
506 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
507 {
508 	/* {
509 		syscallarg(int) cmd;
510 		syscallarg(void *) arg;
511 		syscallarg(int) misc;
512 	} */
513 	struct vnode *vp;
514 	struct nameidata nd;
515 	struct swappri *spp;
516 	struct swapdev *sdp;
517 #define SWAP_PATH_MAX (PATH_MAX + 1)
518 	char	*userpath;
519 	size_t	len = 0;
520 	int	error;
521 	int	priority;
522 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
523 
524 	/*
525 	 * we handle the non-priv NSWAP and STATS request first.
526 	 *
527 	 * SWAP_NSWAP: return number of config'd swap devices
528 	 * [can also be obtained with uvmexp sysctl]
529 	 */
530 	if (SCARG(uap, cmd) == SWAP_NSWAP) {
531 		const int nswapdev = uvmexp.nswapdev;
532 		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
533 		    0, 0, 0);
534 		*retval = nswapdev;
535 		return 0;
536 	}
537 
538 	userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);
539 
540 	/*
541 	 * ensure serialized syscall access by grabbing the swap_syscall_lock
542 	 */
543 	rw_enter(&swap_syscall_lock, RW_WRITER);
544 
545 	/*
546 	 * SWAP_STATS: get stats on current # of configured swap devs
547 	 *
548 	 * note that the swap_priority list can't change as long
549 	 * as we are holding the swap_syscall_lock.  we don't want
550 	 * to grab the uvm_swap_data_lock because we may fault&sleep during
551 	 * copyout() and we don't want to be holding that lock then!
552 	 */
553 	switch (SCARG(uap, cmd)) {
554 	case SWAP_STATS13:
555 		error = (*uvm_swap_stats13)(uap, retval);
556 		goto out;
557 	case SWAP_STATS50:
558 		error = (*uvm_swap_stats50)(uap, retval);
559 		goto out;
560 	case SWAP_STATS:
561 		error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
562 		    NULL, sizeof(struct swapent), retval);
563 		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
564 		goto out;
565 
566 	case SWAP_GETDUMPDEV:
567 		error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
568 		goto out;
569 	default:
570 		break;
571 	}
572 
573 	/*
574 	 * all other requests require superuser privs.   verify.
575 	 */
576 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
577 	    0, NULL, NULL, NULL)))
578 		goto out;
579 
580 	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
581 		/* drop the current dump device */
582 		dumpdev = NODEV;
583 		dumpcdev = NODEV;
584 		cpu_dumpconf();
585 		goto out;
586 	}
587 
588 	/*
589 	 * at this point we expect a path name in arg.   we will
590 	 * use namei() to gain a vnode reference (vref), and lock
591 	 * the vnode (VOP_LOCK).
592 	 *
593 	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
594 	 * miniroot)
595 	 */
596 	if (SCARG(uap, arg) == NULL) {
597 		vp = rootvp;		/* miniroot */
598 		vref(vp);
599 		if (vn_lock(vp, LK_EXCLUSIVE)) {
600 			vrele(vp);
601 			error = EBUSY;
602 			goto out;
603 		}
604 		if (SCARG(uap, cmd) == SWAP_ON &&
605 		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
606 			panic("swapctl: miniroot copy failed");
607 	} else {
608 		struct pathbuf *pb;
609 
610 		/*
611 		 * This used to allow copying in one extra byte
612 		 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
613 		 * This was completely pointless because if anyone
614 		 * used that extra byte namei would fail with
615 		 * ENAMETOOLONG anyway, so I've removed the excess
616 		 * logic. - dholland 20100215
617 		 */
618 
619 		error = pathbuf_copyin(SCARG(uap, arg), &pb);
620 		if (error) {
621 			goto out;
622 		}
623 		if (SCARG(uap, cmd) == SWAP_ON) {
624 			/* get a copy of the string */
625 			pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
626 			len = strlen(userpath) + 1;
627 		}
628 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
629 		if ((error = namei(&nd))) {
630 			pathbuf_destroy(pb);
631 			goto out;
632 		}
633 		vp = nd.ni_vp;
634 		pathbuf_destroy(pb);
635 	}
636 	/* note: "vp" is referenced and locked */
637 
638 	error = 0;		/* assume no error */
639 	switch(SCARG(uap, cmd)) {
640 
641 	case SWAP_DUMPDEV:
642 		if (vp->v_type != VBLK) {
643 			error = ENOTBLK;
644 			break;
645 		}
646 		if (bdevsw_lookup(vp->v_rdev)) {
647 			dumpdev = vp->v_rdev;
648 			dumpcdev = devsw_blk2chr(dumpdev);
649 		} else
650 			dumpdev = NODEV;
651 		cpu_dumpconf();
652 		break;
653 
654 	case SWAP_CTL:
655 		/*
656 		 * get new priority, remove old entry (if any) and then
657 		 * reinsert it in the correct place.  finally, prune out
658 		 * any empty priority structures.
659 		 */
660 		priority = SCARG(uap, misc);
661 		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
662 		mutex_enter(&uvm_swap_data_lock);
663 		if ((sdp = swaplist_find(vp, true)) == NULL) {
664 			error = ENOENT;
665 		} else {
666 			swaplist_insert(sdp, spp, priority);
667 			swaplist_trim();
668 		}
669 		mutex_exit(&uvm_swap_data_lock);
670 		if (error)
671 			kmem_free(spp, sizeof(*spp));
672 		break;
673 
674 	case SWAP_ON:
675 
676 		/*
677 		 * check for duplicates.   if none found, then insert a
678 		 * dummy entry on the list to prevent someone else from
679 		 * trying to enable this device while we are working on
680 		 * it.
681 		 */
682 
683 		priority = SCARG(uap, misc);
684 		sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
685 		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
686 		sdp->swd_flags = SWF_FAKE;
687 		sdp->swd_vp = vp;
688 		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
689 		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
690 		mutex_enter(&uvm_swap_data_lock);
691 		if (swaplist_find(vp, false) != NULL) {
692 			error = EBUSY;
693 			mutex_exit(&uvm_swap_data_lock);
694 			bufq_free(sdp->swd_tab);
695 			kmem_free(sdp, sizeof(*sdp));
696 			kmem_free(spp, sizeof(*spp));
697 			break;
698 		}
699 		swaplist_insert(sdp, spp, priority);
700 		mutex_exit(&uvm_swap_data_lock);
701 
702 		KASSERT(len > 0);
703 		sdp->swd_pathlen = len;
704 		sdp->swd_path = kmem_alloc(len, KM_SLEEP);
705 		if (copystr(userpath, sdp->swd_path, len, 0) != 0)
706 			panic("swapctl: copystr");
707 
708 		/*
709 		 * we've now got a FAKE placeholder in the swap list.
710 		 * now attempt to enable swap on it.  if we fail, undo
711 		 * what we've done and kill the fake entry we just inserted.
712 		 * if swap_on is a success, it will clear the SWF_FAKE flag
713 		 */
714 
715 		if ((error = swap_on(l, sdp)) != 0) {
716 			mutex_enter(&uvm_swap_data_lock);
717 			(void) swaplist_find(vp, true);  /* kill fake entry */
718 			swaplist_trim();
719 			mutex_exit(&uvm_swap_data_lock);
720 			bufq_free(sdp->swd_tab);
721 			kmem_free(sdp->swd_path, sdp->swd_pathlen);
722 			kmem_free(sdp, sizeof(*sdp));
723 			break;
724 		}
725 		break;
726 
727 	case SWAP_OFF:
728 		mutex_enter(&uvm_swap_data_lock);
729 		if ((sdp = swaplist_find(vp, false)) == NULL) {
730 			mutex_exit(&uvm_swap_data_lock);
731 			error = ENXIO;
732 			break;
733 		}
734 
735 		/*
736 		 * If a device isn't in use or enabled, we
737 		 * can't stop swapping from it (again).
738 		 */
739 		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
740 			mutex_exit(&uvm_swap_data_lock);
741 			error = EBUSY;
742 			break;
743 		}
744 
745 		/*
746 		 * do the real work.
747 		 */
748 		error = swap_off(l, sdp);
749 		break;
750 
751 	default:
752 		error = EINVAL;
753 	}
754 
755 	/*
756 	 * done!  release the ref gained by namei() and unlock.
757 	 */
758 	vput(vp);
759 out:
760 	rw_exit(&swap_syscall_lock);
761 	kmem_free(userpath, SWAP_PATH_MAX);
762 
763 	UVMHIST_LOG(pdhist, "<- done!  error=%jd", error, 0, 0, 0);
764 	return (error);
765 }
766 
767 /*
768  * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
769  * away from sys_swapctl() in order to allow COMPAT_* swapctl()
770  * emulation to use it directly without going through sys_swapctl().
771  * The problem with using sys_swapctl() there is that it involves
772  * copying the swapent array to the stackgap, and this array's size
773  * is not known at build time. Hence it would not be possible to
774  * ensure it would fit in the stackgap in any case.
775  */
776 int
777 uvm_swap_stats(char *ptr, int misc,
778     void (*f)(void *, const struct swapent *), size_t len,
779     register_t *retval)
780 {
781 	struct swappri *spp;
782 	struct swapdev *sdp;
783 	struct swapent sep;
784 	int count = 0;
785 	int error;
786 
787 	KASSERT(len <= sizeof(sep));
788 	if (len == 0)
789 		return ENOSYS;
790 
791 	if (misc < 0)
792 		return EINVAL;
793 
794 	if (misc == 0 || uvmexp.nswapdev == 0)
795 		return 0;
796 
797 	/* Make sure userland cannot exhaust kernel memory */
798 	if ((size_t)misc > (size_t)uvmexp.nswapdev)
799 		misc = uvmexp.nswapdev;
800 
801 	KASSERT(rw_lock_held(&swap_syscall_lock));
802 
803 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
804 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
805 			int inuse;
806 
807 			if (misc-- <= 0)
808 				break;
809 
810 			inuse = btodb((uint64_t)sdp->swd_npginuse <<
811 			    PAGE_SHIFT);
812 
813 			memset(&sep, 0, sizeof(sep));
814 			swapent_cvt(&sep, sdp, inuse);
815 			if (f)
816 				(*f)(&sep, &sep);
817 			if ((error = copyout(&sep, ptr, len)) != 0)
818 				return error;
819 			ptr += len;
820 			count++;
821 		}
822 	}
823 	*retval = count;
824 	return 0;
825 }
826 
827 /*
828  * swap_on: attempt to enable a swapdev for swapping.   note that the
829  *	swapdev is already on the global list, but disabled (marked
830  *	SWF_FAKE).
831  *
832  * => we avoid the start of the disk (to protect disk labels)
833  * => we also avoid the miniroot, if we are swapping to root.
834  * => caller should leave uvm_swap_data_lock unlocked, we may lock it
835  *	if needed.
836  */
837 static int
838 swap_on(struct lwp *l, struct swapdev *sdp)
839 {
840 	struct vnode *vp;
841 	int error, npages, nblocks, size;
842 	long addr;
843 	vmem_addr_t result;
844 	struct vattr va;
845 	dev_t dev;
846 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
847 
848 	/*
849 	 * we want to enable swapping on sdp.   the swd_vp contains
850 	 * the vnode we want (locked and ref'd), and the swd_dev
851 	 * contains the dev_t of the file, if it a block device.
852 	 */
853 
854 	vp = sdp->swd_vp;
855 	dev = sdp->swd_dev;
856 
857 	/*
858 	 * open the swap file (mostly useful for block device files to
859 	 * let device driver know what is up).
860 	 *
861 	 * we skip the open/close for root on swap because the root
862 	 * has already been opened when root was mounted (mountroot).
863 	 */
864 	if (vp != rootvp) {
865 		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
866 			return (error);
867 	}
868 
869 	/* XXX this only works for block devices */
870 	UVMHIST_LOG(pdhist, "  dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);
871 
872 	/*
873 	 * we now need to determine the size of the swap area.   for
874 	 * block specials we can call the d_psize function.
875 	 * for normal files, we must stat [get attrs].
876 	 *
877 	 * we put the result in nblks.
878 	 * for normal files, we also want the filesystem block size
879 	 * (which we get with statfs).
880 	 */
881 	switch (vp->v_type) {
882 	case VBLK:
883 		if ((nblocks = bdev_size(dev)) == -1) {
884 			error = ENXIO;
885 			goto bad;
886 		}
887 		break;
888 
889 	case VREG:
890 		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
891 			goto bad;
892 		nblocks = (int)btodb(va.va_size);
893 		sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
894 		/*
895 		 * limit the max # of outstanding I/O requests we issue
896 		 * at any one time.   take it easy on NFS servers.
897 		 */
898 		if (vp->v_tag == VT_NFS)
899 			sdp->swd_maxactive = 2; /* XXX */
900 		else
901 			sdp->swd_maxactive = 8; /* XXX */
902 		break;
903 
904 	default:
905 		error = ENXIO;
906 		goto bad;
907 	}
908 
909 	/*
910 	 * save nblocks in a safe place and convert to pages.
911 	 */
912 
913 	sdp->swd_nblks = nblocks;
914 	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
915 
916 	/*
917 	 * for block special files, we want to make sure that leave
918 	 * the disklabel and bootblocks alone, so we arrange to skip
919 	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
920 	 * note that because of this the "size" can be less than the
921 	 * actual number of blocks on the device.
922 	 */
923 	if (vp->v_type == VBLK) {
924 		/* we use pages 1 to (size - 1) [inclusive] */
925 		size = npages - 1;
926 		addr = 1;
927 	} else {
928 		/* we use pages 0 to (size - 1) [inclusive] */
929 		size = npages;
930 		addr = 0;
931 	}
932 
933 	/*
934 	 * make sure we have enough blocks for a reasonable sized swap
935 	 * area.   we want at least one page.
936 	 */
937 
938 	if (size < 1) {
939 		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
940 		error = EINVAL;
941 		goto bad;
942 	}
943 
944 	UVMHIST_LOG(pdhist, "  dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0);
945 
946 	/*
947 	 * now we need to allocate an extent to manage this swap device
948 	 */
949 
950 	sdp->swd_blist = blist_create(npages);
951 	/* mark all expect the `saved' region free. */
952 	blist_free(sdp->swd_blist, addr, size);
953 
954 	/*
955 	 * allocate space to for swap encryption state and mark the
956 	 * keys uninitialized so we generate them lazily
957 	 */
958 	sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP);
959 	sdp->swd_encinit = false;
960 
961 	/*
962 	 * if the vnode we are swapping to is the root vnode
963 	 * (i.e. we are swapping to the miniroot) then we want
964 	 * to make sure we don't overwrite it.   do a statfs to
965 	 * find its size and skip over it.
966 	 */
967 	if (vp == rootvp) {
968 		struct mount *mp;
969 		struct statvfs *sp;
970 		int rootblocks, rootpages;
971 
972 		mp = rootvnode->v_mount;
973 		sp = &mp->mnt_stat;
974 		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
975 		/*
976 		 * XXX: sp->f_blocks isn't the total number of
977 		 * blocks in the filesystem, it's the number of
978 		 * data blocks.  so, our rootblocks almost
979 		 * definitely underestimates the total size
980 		 * of the filesystem - how badly depends on the
981 		 * details of the filesystem type.  there isn't
982 		 * an obvious way to deal with this cleanly
983 		 * and perfectly, so for now we just pad our
984 		 * rootblocks estimate with an extra 5 percent.
985 		 */
986 		rootblocks += (rootblocks >> 5) +
987 			(rootblocks >> 6) +
988 			(rootblocks >> 7);
989 		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
990 		if (rootpages > size)
991 			panic("swap_on: miniroot larger than swap?");
992 
993 		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
994 			panic("swap_on: unable to preserve miniroot");
995 		}
996 
997 		size -= rootpages;
998 		printf("Preserved %d pages of miniroot ", rootpages);
999 		printf("leaving %d pages of swap\n", size);
1000 	}
1001 
1002 	/*
1003 	 * add a ref to vp to reflect usage as a swap device.
1004 	 */
1005 	vref(vp);
1006 
1007 	/*
1008 	 * now add the new swapdev to the drum and enable.
1009 	 */
1010 	error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
1011 	if (error != 0)
1012 		panic("swapdrum_add");
1013 	/*
1014 	 * If this is the first regular swap create the workqueue.
1015 	 * => Protected by swap_syscall_lock.
1016 	 */
1017 	if (vp->v_type != VBLK) {
1018 		if (sw_reg_count++ == 0) {
1019 			KASSERT(sw_reg_workqueue == NULL);
1020 			if (workqueue_create(&sw_reg_workqueue, "swapiod",
1021 			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
1022 				panic("%s: workqueue_create failed", __func__);
1023 		}
1024 	}
1025 
1026 	sdp->swd_drumoffset = (int)result;
1027 	sdp->swd_drumsize = npages;
1028 	sdp->swd_npages = size;
1029 	mutex_enter(&uvm_swap_data_lock);
1030 	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
1031 	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
1032 	uvmexp.swpages += size;
1033 	uvmexp.swpgavail += size;
1034 	mutex_exit(&uvm_swap_data_lock);
1035 	return (0);
1036 
1037 	/*
1038 	 * failure: clean up and return error.
1039 	 */
1040 
1041 bad:
1042 	if (sdp->swd_blist) {
1043 		blist_destroy(sdp->swd_blist);
1044 	}
1045 	if (vp != rootvp) {
1046 		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
1047 	}
1048 	return (error);
1049 }
1050 
1051 /*
1052  * swap_off: stop swapping on swapdev
1053  *
1054  * => swap data should be locked, we will unlock.
1055  */
1056 static int
1057 swap_off(struct lwp *l, struct swapdev *sdp)
1058 {
1059 	int npages = sdp->swd_npages;
1060 	int error = 0;
1061 
1062 	UVMHIST_FUNC(__func__);
1063 	UVMHIST_CALLARGS(pdhist, "  dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0);
1064 
1065 	KASSERT(rw_write_held(&swap_syscall_lock));
1066 	KASSERT(mutex_owned(&uvm_swap_data_lock));
1067 
1068 	/* disable the swap area being removed */
1069 	sdp->swd_flags &= ~SWF_ENABLE;
1070 	uvmexp.swpgavail -= npages;
1071 	mutex_exit(&uvm_swap_data_lock);
1072 
1073 	/*
1074 	 * the idea is to find all the pages that are paged out to this
1075 	 * device, and page them all in.  in uvm, swap-backed pageable
1076 	 * memory can take two forms: aobjs and anons.  call the
1077 	 * swapoff hook for each subsystem to bring in pages.
1078 	 */
1079 
1080 	if (uao_swap_off(sdp->swd_drumoffset,
1081 			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
1082 	    amap_swap_off(sdp->swd_drumoffset,
1083 			  sdp->swd_drumoffset + sdp->swd_drumsize)) {
1084 		error = ENOMEM;
1085 	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
1086 		error = EBUSY;
1087 	}
1088 
1089 	if (error) {
1090 		mutex_enter(&uvm_swap_data_lock);
1091 		sdp->swd_flags |= SWF_ENABLE;
1092 		uvmexp.swpgavail += npages;
1093 		mutex_exit(&uvm_swap_data_lock);
1094 
1095 		return error;
1096 	}
1097 
1098 	/*
1099 	 * If this is the last regular swap destroy the workqueue.
1100 	 * => Protected by swap_syscall_lock.
1101 	 */
1102 	if (sdp->swd_vp->v_type != VBLK) {
1103 		KASSERT(sw_reg_count > 0);
1104 		KASSERT(sw_reg_workqueue != NULL);
1105 		if (--sw_reg_count == 0) {
1106 			workqueue_destroy(sw_reg_workqueue);
1107 			sw_reg_workqueue = NULL;
1108 		}
1109 	}
1110 
1111 	/*
1112 	 * done with the vnode.
1113 	 * drop our ref on the vnode before calling VOP_CLOSE()
1114 	 * so that spec_close() can tell if this is the last close.
1115 	 */
1116 	vrele(sdp->swd_vp);
1117 	if (sdp->swd_vp != rootvp) {
1118 		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
1119 	}
1120 
1121 	mutex_enter(&uvm_swap_data_lock);
1122 	uvmexp.swpages -= npages;
1123 	uvmexp.swpginuse -= sdp->swd_npgbad;
1124 
1125 	if (swaplist_find(sdp->swd_vp, true) == NULL)
1126 		panic("%s: swapdev not in list", __func__);
1127 	swaplist_trim();
1128 	mutex_exit(&uvm_swap_data_lock);
1129 
1130 	/*
1131 	 * free all resources!
1132 	 */
1133 	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
1134 	blist_destroy(sdp->swd_blist);
1135 	bufq_free(sdp->swd_tab);
1136 	kmem_free(__UNVOLATILE(sdp->swd_encmap),
1137 	    encmap_size(sdp->swd_drumsize));
1138 	explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey);
1139 	explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey);
1140 	kmem_free(sdp, sizeof(*sdp));
1141 	return (0);
1142 }
1143 
1144 void
1145 uvm_swap_shutdown(struct lwp *l)
1146 {
1147 	struct swapdev *sdp;
1148 	struct swappri *spp;
1149 	struct vnode *vp;
1150 	int error;
1151 
1152 	if (!uvm_swap_init_done || uvmexp.nswapdev == 0)
1153 		return;
1154 	printf("turning off swap...");
1155 	rw_enter(&swap_syscall_lock, RW_WRITER);
1156 	mutex_enter(&uvm_swap_data_lock);
1157 again:
1158 	LIST_FOREACH(spp, &swap_priority, spi_swappri)
1159 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1160 			if (sdp->swd_flags & SWF_FAKE)
1161 				continue;
1162 			if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
1163 				continue;
1164 #ifdef DEBUG
1165 			printf("\nturning off swap on %s...", sdp->swd_path);
1166 #endif
1167 			/* Have to lock and reference vnode for swap_off(). */
1168 			vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY);
1169 			vref(vp);
1170 			error = swap_off(l, sdp);
1171 			vput(vp);
1172 			mutex_enter(&uvm_swap_data_lock);
1173 			if (error) {
1174 				printf("stopping swap on %s failed "
1175 				    "with error %d\n", sdp->swd_path, error);
1176 				TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1177 				uvmexp.nswapdev--;
1178 				swaplist_trim();
1179 			}
1180 			goto again;
1181 		}
1182 	printf(" done\n");
1183 	mutex_exit(&uvm_swap_data_lock);
1184 	rw_exit(&swap_syscall_lock);
1185 }
1186 
1187 
1188 /*
1189  * /dev/drum interface and i/o functions
1190  */
1191 
1192 /*
1193  * swstrategy: perform I/O on the drum
1194  *
1195  * => we must map the i/o request from the drum to the correct swapdev.
1196  */
1197 static void
1198 swstrategy(struct buf *bp)
1199 {
1200 	struct swapdev *sdp;
1201 	struct vnode *vp;
1202 	int pageno, bn;
1203 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1204 
1205 	/*
1206 	 * convert block number to swapdev.   note that swapdev can't
1207 	 * be yanked out from under us because we are holding resources
1208 	 * in it (i.e. the blocks we are doing I/O on).
1209 	 */
1210 	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
1211 	mutex_enter(&uvm_swap_data_lock);
1212 	sdp = swapdrum_getsdp(pageno);
1213 	mutex_exit(&uvm_swap_data_lock);
1214 	if (sdp == NULL) {
1215 		bp->b_error = EINVAL;
1216 		bp->b_resid = bp->b_bcount;
1217 		biodone(bp);
1218 		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
1219 		return;
1220 	}
1221 
1222 	/*
1223 	 * convert drum page number to block number on this swapdev.
1224 	 */
1225 
1226 	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
1227 	bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
1228 
1229 	UVMHIST_LOG(pdhist, "  Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd",
1230 		((bp->b_flags & B_READ) == 0) ? 1 : 0,
1231 		sdp->swd_drumoffset, bn, bp->b_bcount);
1232 
1233 	/*
1234 	 * for block devices we finish up here.
1235 	 * for regular files we have to do more work which we delegate
1236 	 * to sw_reg_strategy().
1237 	 */
1238 
1239 	vp = sdp->swd_vp;		/* swapdev vnode pointer */
1240 	switch (vp->v_type) {
1241 	default:
1242 		panic("%s: vnode type 0x%x", __func__, vp->v_type);
1243 
1244 	case VBLK:
1245 
1246 		/*
1247 		 * must convert "bp" from an I/O on /dev/drum to an I/O
1248 		 * on the swapdev (sdp).
1249 		 */
1250 		bp->b_blkno = bn;		/* swapdev block number */
1251 		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
1252 
1253 		/*
1254 		 * if we are doing a write, we have to redirect the i/o on
1255 		 * drum's v_numoutput counter to the swapdevs.
1256 		 */
1257 		if ((bp->b_flags & B_READ) == 0) {
1258 			mutex_enter(bp->b_objlock);
1259 			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
1260 			mutex_exit(bp->b_objlock);
1261 			mutex_enter(vp->v_interlock);
1262 			vp->v_numoutput++;	/* put it on swapdev */
1263 			mutex_exit(vp->v_interlock);
1264 		}
1265 
1266 		/*
1267 		 * finally plug in swapdev vnode and start I/O
1268 		 */
1269 		bp->b_vp = vp;
1270 		bp->b_objlock = vp->v_interlock;
1271 		VOP_STRATEGY(vp, bp);
1272 		return;
1273 
1274 	case VREG:
1275 		/*
1276 		 * delegate to sw_reg_strategy function.
1277 		 */
1278 		sw_reg_strategy(sdp, bp, bn);
1279 		return;
1280 	}
1281 	/* NOTREACHED */
1282 }
1283 
1284 /*
1285  * swread: the read function for the drum (just a call to physio)
1286  */
1287 /*ARGSUSED*/
1288 static int
1289 swread(dev_t dev, struct uio *uio, int ioflag)
1290 {
1291 	UVMHIST_FUNC(__func__);
1292 	UVMHIST_CALLARGS(pdhist, "  dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
1293 
1294 	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1295 }
1296 
1297 /*
1298  * swwrite: the write function for the drum (just a call to physio)
1299  */
1300 /*ARGSUSED*/
1301 static int
1302 swwrite(dev_t dev, struct uio *uio, int ioflag)
1303 {
1304 	UVMHIST_FUNC(__func__);
1305 	UVMHIST_CALLARGS(pdhist, "  dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
1306 
1307 	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1308 }
1309 
1310 const struct bdevsw swap_bdevsw = {
1311 	.d_open = nullopen,
1312 	.d_close = nullclose,
1313 	.d_strategy = swstrategy,
1314 	.d_ioctl = noioctl,
1315 	.d_dump = nodump,
1316 	.d_psize = nosize,
1317 	.d_discard = nodiscard,
1318 	.d_flag = D_OTHER
1319 };
1320 
1321 const struct cdevsw swap_cdevsw = {
1322 	.d_open = nullopen,
1323 	.d_close = nullclose,
1324 	.d_read = swread,
1325 	.d_write = swwrite,
1326 	.d_ioctl = noioctl,
1327 	.d_stop = nostop,
1328 	.d_tty = notty,
1329 	.d_poll = nopoll,
1330 	.d_mmap = nommap,
1331 	.d_kqfilter = nokqfilter,
1332 	.d_discard = nodiscard,
1333 	.d_flag = D_OTHER,
1334 };
1335 
1336 /*
1337  * sw_reg_strategy: handle swap i/o to regular files
1338  */
1339 static void
1340 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
1341 {
1342 	struct vnode	*vp;
1343 	struct vndxfer	*vnx;
1344 	daddr_t		nbn;
1345 	char 		*addr;
1346 	off_t		byteoff;
1347 	int		s, off, nra, error, sz, resid;
1348 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1349 
1350 	/*
1351 	 * allocate a vndxfer head for this transfer and point it to
1352 	 * our buffer.
1353 	 */
1354 	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
1355 	vnx->vx_flags = VX_BUSY;
1356 	vnx->vx_error = 0;
1357 	vnx->vx_pending = 0;
1358 	vnx->vx_bp = bp;
1359 	vnx->vx_sdp = sdp;
1360 
1361 	/*
1362 	 * setup for main loop where we read filesystem blocks into
1363 	 * our buffer.
1364 	 */
1365 	error = 0;
1366 	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
1367 	addr = bp->b_data;		/* current position in buffer */
1368 	byteoff = dbtob((uint64_t)bn);
1369 
1370 	for (resid = bp->b_resid; resid; resid -= sz) {
1371 		struct vndbuf	*nbp;
1372 
1373 		/*
1374 		 * translate byteoffset into block number.  return values:
1375 		 *   vp = vnode of underlying device
1376 		 *  nbn = new block number (on underlying vnode dev)
1377 		 *  nra = num blocks we can read-ahead (excludes requested
1378 		 *	block)
1379 		 */
1380 		nra = 0;
1381 		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1382 				 	&vp, &nbn, &nra);
1383 
1384 		if (error == 0 && nbn == (daddr_t)-1) {
1385 			/*
1386 			 * this used to just set error, but that doesn't
1387 			 * do the right thing.  Instead, it causes random
1388 			 * memory errors.  The panic() should remain until
1389 			 * this condition doesn't destabilize the system.
1390 			 */
1391 #if 1
1392 			panic("%s: swap to sparse file", __func__);
1393 #else
1394 			error = EIO;	/* failure */
1395 #endif
1396 		}
1397 
1398 		/*
1399 		 * punt if there was an error or a hole in the file.
1400 		 * we must wait for any i/o ops we have already started
1401 		 * to finish before returning.
1402 		 *
1403 		 * XXX we could deal with holes here but it would be
1404 		 * a hassle (in the write case).
1405 		 */
1406 		if (error) {
1407 			s = splbio();
1408 			vnx->vx_error = error;	/* pass error up */
1409 			goto out;
1410 		}
1411 
1412 		/*
1413 		 * compute the size ("sz") of this transfer (in bytes).
1414 		 */
1415 		off = byteoff % sdp->swd_bsize;
1416 		sz = (1 + nra) * sdp->swd_bsize - off;
1417 		if (sz > resid)
1418 			sz = resid;
1419 
1420 		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
1421 		    "vp %#jx/%#jx offset %#jx/%#jx",
1422 		    (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);
1423 
1424 		/*
1425 		 * now get a buf structure.   note that the vb_buf is
1426 		 * at the front of the nbp structure so that you can
1427 		 * cast pointers between the two structure easily.
1428 		 */
1429 		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
1430 		buf_init(&nbp->vb_buf);
1431 		nbp->vb_buf.b_flags    = bp->b_flags;
1432 		nbp->vb_buf.b_cflags   = bp->b_cflags;
1433 		nbp->vb_buf.b_oflags   = bp->b_oflags;
1434 		nbp->vb_buf.b_bcount   = sz;
1435 		nbp->vb_buf.b_bufsize  = sz;
1436 		nbp->vb_buf.b_error    = 0;
1437 		nbp->vb_buf.b_data     = addr;
1438 		nbp->vb_buf.b_lblkno   = 0;
1439 		nbp->vb_buf.b_blkno    = nbn + btodb(off);
1440 		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
1441 		nbp->vb_buf.b_iodone   = sw_reg_biodone;
1442 		nbp->vb_buf.b_vp       = vp;
1443 		nbp->vb_buf.b_objlock  = vp->v_interlock;
1444 		if (vp->v_type == VBLK) {
1445 			nbp->vb_buf.b_dev = vp->v_rdev;
1446 		}
1447 
1448 		nbp->vb_xfer = vnx;	/* patch it back in to vnx */
1449 
1450 		/*
1451 		 * Just sort by block number
1452 		 */
1453 		s = splbio();
1454 		if (vnx->vx_error != 0) {
1455 			buf_destroy(&nbp->vb_buf);
1456 			pool_put(&vndbuf_pool, nbp);
1457 			goto out;
1458 		}
1459 		vnx->vx_pending++;
1460 
1461 		/* sort it in and start I/O if we are not over our limit */
1462 		/* XXXAD locking */
1463 		bufq_put(sdp->swd_tab, &nbp->vb_buf);
1464 		sw_reg_start(sdp);
1465 		splx(s);
1466 
1467 		/*
1468 		 * advance to the next I/O
1469 		 */
1470 		byteoff += sz;
1471 		addr += sz;
1472 	}
1473 
1474 	s = splbio();
1475 
1476 out: /* Arrive here at splbio */
1477 	vnx->vx_flags &= ~VX_BUSY;
1478 	if (vnx->vx_pending == 0) {
1479 		error = vnx->vx_error;
1480 		pool_put(&vndxfer_pool, vnx);
1481 		bp->b_error = error;
1482 		biodone(bp);
1483 	}
1484 	splx(s);
1485 }
1486 
1487 /*
1488  * sw_reg_start: start an I/O request on the requested swapdev
1489  *
1490  * => reqs are sorted by b_rawblkno (above)
1491  */
1492 static void
1493 sw_reg_start(struct swapdev *sdp)
1494 {
1495 	struct buf	*bp;
1496 	struct vnode	*vp;
1497 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1498 
1499 	/* recursion control */
1500 	if ((sdp->swd_flags & SWF_BUSY) != 0)
1501 		return;
1502 
1503 	sdp->swd_flags |= SWF_BUSY;
1504 
1505 	while (sdp->swd_active < sdp->swd_maxactive) {
1506 		bp = bufq_get(sdp->swd_tab);
1507 		if (bp == NULL)
1508 			break;
1509 		sdp->swd_active++;
1510 
1511 		UVMHIST_LOG(pdhist,
1512 		    "sw_reg_start:  bp %#jx vp %#jx blkno %#jx cnt %#jx",
1513 		    (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
1514 		    bp->b_bcount);
1515 		vp = bp->b_vp;
1516 		KASSERT(bp->b_objlock == vp->v_interlock);
1517 		if ((bp->b_flags & B_READ) == 0) {
1518 			mutex_enter(vp->v_interlock);
1519 			vp->v_numoutput++;
1520 			mutex_exit(vp->v_interlock);
1521 		}
1522 		VOP_STRATEGY(vp, bp);
1523 	}
1524 	sdp->swd_flags &= ~SWF_BUSY;
1525 }
1526 
1527 /*
1528  * sw_reg_biodone: one of our i/o's has completed
1529  */
1530 static void
1531 sw_reg_biodone(struct buf *bp)
1532 {
1533 	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
1534 }
1535 
1536 /*
1537  * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1538  *
1539  * => note that we can recover the vndbuf struct by casting the buf ptr
1540  */
1541 static void
1542 sw_reg_iodone(struct work *wk, void *dummy)
1543 {
1544 	struct vndbuf *vbp = (void *)wk;
1545 	struct vndxfer *vnx = vbp->vb_xfer;
1546 	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
1547 	struct swapdev	*sdp = vnx->vx_sdp;
1548 	int s, resid, error;
1549 	KASSERT(&vbp->vb_buf.b_work == wk);
1550 	UVMHIST_FUNC(__func__);
1551 	UVMHIST_CALLARGS(pdhist, "  vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx",
1552 	    (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
1553 	    (uintptr_t)vbp->vb_buf.b_data);
1554 	UVMHIST_LOG(pdhist, "  cnt=%#jx resid=%#jx",
1555 	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
1556 
1557 	/*
1558 	 * protect vbp at splbio and update.
1559 	 */
1560 
1561 	s = splbio();
1562 	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1563 	pbp->b_resid -= resid;
1564 	vnx->vx_pending--;
1565 
1566 	if (vbp->vb_buf.b_error != 0) {
1567 		/* pass error upward */
1568 		error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
1569 		UVMHIST_LOG(pdhist, "  got error=%jd !", error, 0, 0, 0);
1570 		vnx->vx_error = error;
1571 	}
1572 
1573 	/*
1574 	 * kill vbp structure
1575 	 */
1576 	buf_destroy(&vbp->vb_buf);
1577 	pool_put(&vndbuf_pool, vbp);
1578 
1579 	/*
1580 	 * wrap up this transaction if it has run to completion or, in
1581 	 * case of an error, when all auxiliary buffers have returned.
1582 	 */
1583 	if (vnx->vx_error != 0) {
1584 		/* pass error upward */
1585 		error = vnx->vx_error;
1586 		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
1587 			pbp->b_error = error;
1588 			biodone(pbp);
1589 			pool_put(&vndxfer_pool, vnx);
1590 		}
1591 	} else if (pbp->b_resid == 0) {
1592 		KASSERT(vnx->vx_pending == 0);
1593 		if ((vnx->vx_flags & VX_BUSY) == 0) {
1594 			UVMHIST_LOG(pdhist, "  iodone, pbp=%#jx error=%jd !",
1595 			    (uintptr_t)pbp, vnx->vx_error, 0, 0);
1596 			biodone(pbp);
1597 			pool_put(&vndxfer_pool, vnx);
1598 		}
1599 	}
1600 
1601 	/*
1602 	 * done!   start next swapdev I/O if one is pending
1603 	 */
1604 	sdp->swd_active--;
1605 	sw_reg_start(sdp);
1606 	splx(s);
1607 }
1608 
1609 
1610 /*
1611  * uvm_swap_alloc: allocate space on swap
1612  *
1613  * => allocation is done "round robin" down the priority list, as we
1614  *	allocate in a priority we "rotate" the circle queue.
1615  * => space can be freed with uvm_swap_free
1616  * => we return the page slot number in /dev/drum (0 == invalid slot)
1617  * => we lock uvm_swap_data_lock
1618  * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1619  */
1620 int
1621 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
1622 {
1623 	struct swapdev *sdp;
1624 	struct swappri *spp;
1625 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1626 
1627 	/*
1628 	 * no swap devices configured yet?   definite failure.
1629 	 */
1630 	if (uvmexp.nswapdev < 1)
1631 		return 0;
1632 
1633 	/*
1634 	 * XXXJAK: BEGIN HACK
1635 	 *
1636 	 * blist_alloc() in subr_blist.c will panic if we try to allocate
1637 	 * too many slots.
1638 	 */
1639 	if (*nslots > BLIST_MAX_ALLOC) {
1640 		if (__predict_false(lessok == false))
1641 			return 0;
1642 		*nslots = BLIST_MAX_ALLOC;
1643 	}
1644 	/* XXXJAK: END HACK */
1645 
1646 	/*
1647 	 * lock data lock, convert slots into blocks, and enter loop
1648 	 */
1649 	mutex_enter(&uvm_swap_data_lock);
1650 
1651 ReTry:	/* XXXMRG */
1652 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
1653 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1654 			uint64_t result;
1655 
1656 			/* if it's not enabled, then we can't swap from it */
1657 			if ((sdp->swd_flags & SWF_ENABLE) == 0)
1658 				continue;
1659 			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1660 				continue;
1661 			result = blist_alloc(sdp->swd_blist, *nslots);
1662 			if (result == BLIST_NONE) {
1663 				continue;
1664 			}
1665 			KASSERT(result < sdp->swd_drumsize);
1666 
1667 			/*
1668 			 * successful allocation!  now rotate the tailq.
1669 			 */
1670 			TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1671 			TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1672 			sdp->swd_npginuse += *nslots;
1673 			uvmexp.swpginuse += *nslots;
1674 			mutex_exit(&uvm_swap_data_lock);
1675 			/* done!  return drum slot number */
1676 			UVMHIST_LOG(pdhist,
1677 			    "success!  returning %jd slots starting at %jd",
1678 			    *nslots, result + sdp->swd_drumoffset, 0, 0);
1679 			return (result + sdp->swd_drumoffset);
1680 		}
1681 	}
1682 
1683 	/* XXXMRG: BEGIN HACK */
1684 	if (*nslots > 1 && lessok) {
1685 		*nslots = 1;
1686 		/* XXXMRG: ugh!  blist should support this for us */
1687 		goto ReTry;
1688 	}
1689 	/* XXXMRG: END HACK */
1690 
1691 	mutex_exit(&uvm_swap_data_lock);
1692 	return 0;
1693 }
1694 
1695 /*
1696  * uvm_swapisfull: return true if most of available swap is allocated
1697  * and in use.  we don't count some small portion as it may be inaccessible
1698  * to us at any given moment, for example if there is lock contention or if
1699  * pages are busy.
1700  */
1701 bool
1702 uvm_swapisfull(void)
1703 {
1704 	int swpgonly;
1705 	bool rv;
1706 
1707 	if (uvmexp.swpages == 0) {
1708 		return true;
1709 	}
1710 
1711 	mutex_enter(&uvm_swap_data_lock);
1712 	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
1713 	swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
1714 	    uvm_swapisfull_factor);
1715 	rv = (swpgonly >= uvmexp.swpgavail);
1716 	mutex_exit(&uvm_swap_data_lock);
1717 
1718 	return (rv);
1719 }
1720 
1721 /*
1722  * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1723  *
1724  * => we lock uvm_swap_data_lock
1725  */
1726 void
1727 uvm_swap_markbad(int startslot, int nslots)
1728 {
1729 	struct swapdev *sdp;
1730 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1731 
1732 	mutex_enter(&uvm_swap_data_lock);
1733 	sdp = swapdrum_getsdp(startslot);
1734 	KASSERT(sdp != NULL);
1735 
1736 	/*
1737 	 * we just keep track of how many pages have been marked bad
1738 	 * in this device, to make everything add up in swap_off().
1739 	 * we assume here that the range of slots will all be within
1740 	 * one swap device.
1741 	 */
1742 
1743 	KASSERT(uvmexp.swpgonly >= nslots);
1744 	atomic_add_int(&uvmexp.swpgonly, -nslots);
1745 	sdp->swd_npgbad += nslots;
1746 	UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0);
1747 	mutex_exit(&uvm_swap_data_lock);
1748 }
1749 
1750 /*
1751  * uvm_swap_free: free swap slots
1752  *
1753  * => this can be all or part of an allocation made by uvm_swap_alloc
1754  * => we lock uvm_swap_data_lock
1755  */
1756 void
1757 uvm_swap_free(int startslot, int nslots)
1758 {
1759 	struct swapdev *sdp;
1760 	UVMHIST_FUNC(__func__);
1761 	UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots,
1762 	    startslot, 0, 0);
1763 
1764 	/*
1765 	 * ignore attempts to free the "bad" slot.
1766 	 */
1767 
1768 	if (startslot == SWSLOT_BAD) {
1769 		return;
1770 	}
1771 
1772 	/*
1773 	 * convert drum slot offset back to sdp, free the blocks
1774 	 * in the extent, and return.   must hold pri lock to do
1775 	 * lookup and access the extent.
1776 	 */
1777 
1778 	mutex_enter(&uvm_swap_data_lock);
1779 	sdp = swapdrum_getsdp(startslot);
1780 	KASSERT(uvmexp.nswapdev >= 1);
1781 	KASSERT(sdp != NULL);
1782 	KASSERT(sdp->swd_npginuse >= nslots);
1783 	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
1784 	sdp->swd_npginuse -= nslots;
1785 	uvmexp.swpginuse -= nslots;
1786 	mutex_exit(&uvm_swap_data_lock);
1787 }
1788 
1789 /*
1790  * uvm_swap_put: put any number of pages into a contig place on swap
1791  *
1792  * => can be sync or async
1793  */
1794 
1795 int
1796 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
1797 {
1798 	int error;
1799 
1800 	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
1801 	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1802 	return error;
1803 }
1804 
1805 /*
1806  * uvm_swap_get: get a single page from swap
1807  *
1808  * => usually a sync op (from fault)
1809  */
1810 
1811 int
1812 uvm_swap_get(struct vm_page *page, int swslot, int flags)
1813 {
1814 	int error;
1815 
1816 	atomic_inc_uint(&uvmexp.nswget);
1817 	KASSERT(flags & PGO_SYNCIO);
1818 	if (swslot == SWSLOT_BAD) {
1819 		return EIO;
1820 	}
1821 
1822 	error = uvm_swap_io(&page, swslot, 1, B_READ |
1823 	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1824 	if (error == 0) {
1825 
1826 		/*
1827 		 * this page is no longer only in swap.
1828 		 */
1829 
1830 		KASSERT(uvmexp.swpgonly > 0);
1831 		atomic_dec_uint(&uvmexp.swpgonly);
1832 	}
1833 	return error;
1834 }
1835 
1836 /*
1837  * uvm_swap_io: do an i/o operation to swap
1838  */
1839 
1840 static int
1841 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
1842 {
1843 	daddr_t startblk;
1844 	struct	buf *bp;
1845 	vaddr_t kva;
1846 	int	error, mapinflags;
1847 	bool write, async, swap_encrypt;
1848 	UVMHIST_FUNC(__func__);
1849 	UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx",
1850 	    startslot, npages, flags, 0);
1851 
1852 	write = (flags & B_READ) == 0;
1853 	async = (flags & B_ASYNC) != 0;
1854 	swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt);
1855 
1856 	/*
1857 	 * allocate a buf for the i/o.
1858 	 */
1859 
1860 	KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
1861 	bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
1862 	if (bp == NULL) {
1863 		uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
1864 		return ENOMEM;
1865 	}
1866 
1867 	/*
1868 	 * convert starting drum slot to block number
1869 	 */
1870 
1871 	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
1872 
1873 	/*
1874 	 * first, map the pages into the kernel.
1875 	 */
1876 
1877 	mapinflags = !write ?
1878 		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
1879 		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
1880 	if (write && swap_encrypt)	/* need to encrypt in-place */
1881 		mapinflags |= UVMPAGER_MAPIN_READ;
1882 	kva = uvm_pagermapin(pps, npages, mapinflags);
1883 
1884 	/*
1885 	 * encrypt writes in place if requested
1886 	 */
1887 
1888 	if (write) do {
1889 		struct swapdev *sdp;
1890 		int i;
1891 
1892 		/*
1893 		 * Get the swapdev so we can discriminate on the
1894 		 * encryption state.  There may or may not be an
1895 		 * encryption key generated; we may or may not be asked
1896 		 * to encrypt swap.
1897 		 *
1898 		 * 1. NO KEY, NO ENCRYPTION: Nothing to do.
1899 		 *
1900 		 * 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt,
1901 		 *    and mark the slots encrypted.
1902 		 *
1903 		 * 3. KEY, BUT NO ENCRYPTION: The slots may already be
1904 		 *    marked encrypted from a past life.  Mark them not
1905 		 *    encrypted.
1906 		 *
1907 		 * 4. KEY, ENCRYPTION: Encrypt and mark the slots
1908 		 *    encrypted.
1909 		 */
1910 		mutex_enter(&uvm_swap_data_lock);
1911 		sdp = swapdrum_getsdp(startslot);
1912 		if (!sdp->swd_encinit) {
1913 			if (!swap_encrypt) {
1914 				mutex_exit(&uvm_swap_data_lock);
1915 				break;
1916 			}
1917 			uvm_swap_genkey(sdp);
1918 		}
1919 		KASSERT(sdp->swd_encinit);
1920 		mutex_exit(&uvm_swap_data_lock);
1921 
1922 		for (i = 0; i < npages; i++) {
1923 			int s = startslot + i;
1924 			KDASSERT(swapdrum_sdp_is(s, sdp));
1925 			KASSERT(s >= sdp->swd_drumoffset);
1926 			s -= sdp->swd_drumoffset;
1927 			KASSERT(s < sdp->swd_drumsize);
1928 
1929 			if (swap_encrypt) {
1930 				uvm_swap_encryptpage(sdp,
1931 				    (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
1932 				atomic_or_32(&sdp->swd_encmap[s/32],
1933 				    __BIT(s%32));
1934 			} else {
1935 				atomic_and_32(&sdp->swd_encmap[s/32],
1936 				    ~__BIT(s%32));
1937 			}
1938 		}
1939 	} while (0);
1940 
1941 	/*
1942 	 * fill in the bp/sbp.   we currently route our i/o through
1943 	 * /dev/drum's vnode [swapdev_vp].
1944 	 */
1945 
1946 	bp->b_cflags = BC_BUSY | BC_NOCACHE;
1947 	bp->b_flags = (flags & (B_READ|B_ASYNC));
1948 	bp->b_proc = &proc0;	/* XXX */
1949 	bp->b_vnbufs.le_next = NOLIST;
1950 	bp->b_data = (void *)kva;
1951 	bp->b_blkno = startblk;
1952 	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
1953 
1954 	/*
1955 	 * bump v_numoutput (counter of number of active outputs).
1956 	 */
1957 
1958 	if (write) {
1959 		mutex_enter(swapdev_vp->v_interlock);
1960 		swapdev_vp->v_numoutput++;
1961 		mutex_exit(swapdev_vp->v_interlock);
1962 	}
1963 
1964 	/*
1965 	 * for async ops we must set up the iodone handler.
1966 	 */
1967 
1968 	if (async) {
1969 		bp->b_iodone = uvm_aio_aiodone;
1970 		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
1971 		if (curlwp == uvm.pagedaemon_lwp)
1972 			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1973 		else
1974 			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
1975 	} else {
1976 		bp->b_iodone = NULL;
1977 		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1978 	}
1979 	UVMHIST_LOG(pdhist,
1980 	    "about to start io: data = %#jx blkno = %#jx, bcount = %jd",
1981 	    (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);
1982 
1983 	/*
1984 	 * now we start the I/O, and if async, return.
1985 	 */
1986 
1987 	VOP_STRATEGY(swapdev_vp, bp);
1988 	if (async) {
1989 		/*
1990 		 * Reads are always synchronous; if this changes, we
1991 		 * need to add an asynchronous path for decryption.
1992 		 */
1993 		KASSERT(write);
1994 		return 0;
1995 	}
1996 
1997 	/*
1998 	 * must be sync i/o.   wait for it to finish
1999 	 */
2000 
2001 	error = biowait(bp);
2002 	if (error)
2003 		goto out;
2004 
2005 	/*
2006 	 * decrypt reads in place if needed
2007 	 */
2008 
2009 	if (!write) do {
2010 		struct swapdev *sdp;
2011 		bool encinit;
2012 		int i;
2013 
2014 		/*
2015 		 * Get the sdp.  Everything about it except the encinit
2016 		 * bit, saying whether the encryption key is
2017 		 * initialized or not, and the encrypted bit for each
2018 		 * page, is stable until all swap pages have been
2019 		 * released and the device is removed.
2020 		 */
2021 		mutex_enter(&uvm_swap_data_lock);
2022 		sdp = swapdrum_getsdp(startslot);
2023 		encinit = sdp->swd_encinit;
2024 		mutex_exit(&uvm_swap_data_lock);
2025 
2026 		if (!encinit)
2027 			/*
2028 			 * If there's no encryption key, there's no way
2029 			 * any of these slots can be encrypted, so
2030 			 * nothing to do here.
2031 			 */
2032 			break;
2033 		for (i = 0; i < npages; i++) {
2034 			int s = startslot + i;
2035 			KDASSERT(swapdrum_sdp_is(s, sdp));
2036 			KASSERT(s >= sdp->swd_drumoffset);
2037 			s -= sdp->swd_drumoffset;
2038 			KASSERT(s < sdp->swd_drumsize);
2039 			if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) &
2040 				__BIT(s%32)) == 0)
2041 				continue;
2042 			uvm_swap_decryptpage(sdp,
2043 			    (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
2044 		}
2045 	} while (0);
2046 out:
2047 	/*
2048 	 * kill the pager mapping
2049 	 */
2050 
2051 	uvm_pagermapout(kva, npages);
2052 
2053 	/*
2054 	 * now dispose of the buf and we're done.
2055 	 */
2056 
2057 	if (write) {
2058 		mutex_enter(swapdev_vp->v_interlock);
2059 		vwakeup(bp);
2060 		mutex_exit(swapdev_vp->v_interlock);
2061 	}
2062 	putiobuf(bp);
2063 	UVMHIST_LOG(pdhist, "<- done (sync)  error=%jd", error, 0, 0, 0);
2064 
2065 	return (error);
2066 }
2067 
2068 /*
2069  * uvm_swap_genkey(sdp)
2070  *
2071  *	Generate a key for swap encryption.
2072  */
2073 static void
2074 uvm_swap_genkey(struct swapdev *sdp)
2075 {
2076 	uint8_t key[32];
2077 
2078 	KASSERT(!sdp->swd_encinit);
2079 
2080 	cprng_strong(kern_cprng, key, sizeof key, 0);
2081 	aes_setenckey256(&sdp->swd_enckey, key);
2082 	aes_setdeckey256(&sdp->swd_deckey, key);
2083 	explicit_memset(key, 0, sizeof key);
2084 
2085 	sdp->swd_encinit = true;
2086 }
2087 
2088 /*
2089  * uvm_swap_encryptpage(sdp, kva, slot)
2090  *
2091  *	Encrypt one page of data at kva for the specified slot number
2092  *	in the swap device.
2093  */
2094 static void
2095 uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot)
2096 {
2097 	uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
2098 
2099 	/* iv := AES_k(le32enc(slot) || 0^96) */
2100 	le32enc(preiv, slot);
2101 	aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
2102 
2103 	/* *kva := AES-CBC_k(iv, *kva) */
2104 	aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv,
2105 	    AES_256_NROUNDS);
2106 
2107 	explicit_memset(&iv, 0, sizeof iv);
2108 }
2109 
2110 /*
2111  * uvm_swap_decryptpage(sdp, kva, slot)
2112  *
2113  *	Decrypt one page of data at kva for the specified slot number
2114  *	in the swap device.
2115  */
2116 static void
2117 uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot)
2118 {
2119 	uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
2120 
2121 	/* iv := AES_k(le32enc(slot) || 0^96) */
2122 	le32enc(preiv, slot);
2123 	aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
2124 
2125 	/* *kva := AES-CBC^{-1}_k(iv, *kva) */
2126 	aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv,
2127 	    AES_256_NROUNDS);
2128 
2129 	explicit_memset(&iv, 0, sizeof iv);
2130 }
2131 
2132 SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup")
2133 {
2134 
2135 	sysctl_createv(clog, 0, NULL, NULL,
2136 	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt",
2137 	    SYSCTL_DESCR("Encrypt data when swapped out to disk"),
2138 	    NULL, 0, &uvm_swap_encrypt, 0,
2139 	    CTL_VM, CTL_CREATE, CTL_EOL);
2140 }
2141