1 /*	$NetBSD: uvm_swap.c,v 1.204 2021/05/23 00:36:36 mrg Exp $	*/
2 
3 /*
4  * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
29  * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.204 2021/05/23 00:36:36 mrg Exp $");
34 
35 #include "opt_uvmhist.h"
36 #include "opt_compat_netbsd.h"
37 #include "opt_ddb.h"
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/atomic.h>
42 #include <sys/buf.h>
43 #include <sys/bufq.h>
44 #include <sys/conf.h>
45 #include <sys/cprng.h>
46 #include <sys/proc.h>
47 #include <sys/namei.h>
48 #include <sys/disklabel.h>
49 #include <sys/errno.h>
50 #include <sys/kernel.h>
51 #include <sys/vnode.h>
52 #include <sys/file.h>
53 #include <sys/vmem.h>
54 #include <sys/blist.h>
55 #include <sys/mount.h>
56 #include <sys/pool.h>
57 #include <sys/kmem.h>
58 #include <sys/syscallargs.h>
59 #include <sys/swap.h>
60 #include <sys/kauth.h>
61 #include <sys/sysctl.h>
62 #include <sys/workqueue.h>
63 
64 #include <uvm/uvm.h>
65 
66 #include <miscfs/specfs/specdev.h>
67 
68 #include <crypto/aes/aes.h>
69 #include <crypto/aes/aes_cbc.h>
70 
71 /*
72  * uvm_swap.c: manage configuration and i/o to swap space.
73  */
74 
75 /*
76  * swap space is managed in the following way:
77  *
78  * each swap partition or file is described by a "swapdev" structure.
79  * each "swapdev" structure contains a "swapent" structure which contains
80  * information that is passed up to the user (via system calls).
81  *
82  * each swap partition is assigned a "priority" (int) which controls
83  * swap partition usage.
84  *
85  * the system maintains a global data structure describing all swap
86  * partitions/files.   there is a sorted LIST of "swappri" structures
87  * which describe "swapdev"'s at that priority.   this LIST is headed
88  * by the "swap_priority" global var.    each "swappri" contains a
89  * TAILQ of "swapdev" structures at that priority.
90  *
91  * locking:
92  *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
93  *    system call and prevents the swap priority list from changing
94  *    while we are in the middle of a system call (e.g. SWAP_STATS).
95  *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
96  *    structures including the priority list, the swapdev structures,
97  *    and the swapmap arena.
98  *
99  * each swap device has the following info:
100  *  - swap device in use (could be disabled, preventing future use)
101  *  - swap enabled (allows new allocations on swap)
102  *  - map info in /dev/drum
103  *  - vnode pointer
104  * for swap files only:
105  *  - block size
106  *  - max byte count in buffer
107  *  - buffer
108  *
109  * userland controls and configures swap with the swapctl(2) system call.
110  * the sys_swapctl() function performs the following operations:
111  *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
112  *  [2] SWAP_STATS: given a pointer to an array of swapent structures
113  *	(passed in via "arg") of a size passed in via "misc" ... we load
114  *	the current swap config into the array. The actual work is done
115  *	in the uvm_swap_stats() function.
116  *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
117  *	priority in "misc", start swapping on it.
118  *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
119  *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
120  *	"misc")
121  */
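
/*
 * for illustration only (a hypothetical userland sketch, not part of
 * this file): enabling swap on a block device at the default priority
 * might look like
 *
 *	#include <sys/swap.h>
 *	#include <err.h>
 *	#include <unistd.h>
 *
 *	if (swapctl(SWAP_ON, "/dev/wd0b", 0) == -1)
 *		err(1, "swapctl(SWAP_ON)");
 *
 * where "/dev/wd0b" is just an example device path.
 */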
122 
123 /*
124  * swapdev: describes a single swap partition/file
125  *
126  * note the following should be true:
127  * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
128  * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
129  */
130 struct swapdev {
131 	dev_t			swd_dev;	/* device id */
132 	int			swd_flags;	/* flags:inuse/enable/fake */
133 	int			swd_priority;	/* our priority */
134 	int			swd_nblks;	/* blocks in this device */
135 	char			*swd_path;	/* saved pathname of device */
136 	int			swd_pathlen;	/* length of pathname */
137 	int			swd_npages;	/* #pages we can use */
138 	int			swd_npginuse;	/* #pages in use */
139 	int			swd_npgbad;	/* #pages bad */
140 	int			swd_drumoffset;	/* page0 offset in drum */
141 	int			swd_drumsize;	/* #pages in drum */
142 	blist_t			swd_blist;	/* blist for this swapdev */
143 	struct vnode		*swd_vp;	/* backing vnode */
144 	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */
145 
146 	int			swd_bsize;	/* blocksize (bytes) */
147 	int			swd_maxactive;	/* max active i/o reqs */
148 	struct bufq_state	*swd_tab;	/* buffer list */
149 	int			swd_active;	/* number of active buffers */
150 
151 	volatile uint32_t	*swd_encmap;	/* bitmap of encrypted slots */
152 	struct aesenc		swd_enckey;	/* AES key expanded for enc */
153 	struct aesdec		swd_deckey;	/* AES key expanded for dec */
154 	bool			swd_encinit;	/* true if keys initialized */
155 };
156 
157 /*
158  * swap device priority entry; the list is kept sorted on `spi_priority'.
159  */
160 struct swappri {
161 	int			spi_priority;     /* priority */
162 	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
163 	/* tailq of swapdevs at this priority */
164 	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
165 };
166 
167 /*
168  * The following two structures are used to keep track of data transfers
169  * on swap devices associated with regular files.
170  * NOTE: this code is more or less a copy of vnd.c; we use the same
171  * structure names here to ease porting.
172  */
173 struct vndxfer {
174 	struct buf	*vx_bp;		/* Pointer to parent buffer */
175 	struct swapdev	*vx_sdp;
176 	int		vx_error;
177 	int		vx_pending;	/* # of pending aux buffers */
178 	int		vx_flags;
179 #define VX_BUSY		1
180 #define VX_DEAD		2
181 };
182 
183 struct vndbuf {
184 	struct buf	vb_buf;
185 	struct vndxfer	*vb_xfer;
186 };
187 
188 /*
189  * We keep a pool of vndbuf's and vndxfer structures.
190  */
191 static struct pool vndxfer_pool, vndbuf_pool;
192 
193 /*
194  * local variables
195  */
196 static vmem_t *swapmap;	/* controls the mapping of /dev/drum */
197 
198 /* list of all active swap devices [by priority] */
199 LIST_HEAD(swap_priority, swappri);
200 static struct swap_priority swap_priority;
201 
202 /* locks */
203 static kmutex_t uvm_swap_data_lock __cacheline_aligned;
204 static krwlock_t swap_syscall_lock;
205 bool uvm_swap_init_done = false;
206 
207 /* workqueue and use counter for swap to regular files */
208 static int sw_reg_count = 0;
209 static struct workqueue *sw_reg_workqueue;
210 
211 /* tuneables */
212 u_int uvm_swapisfull_factor = 99;
213 bool uvm_swap_encrypt = false;
214 
215 /*
216  * prototypes
217  */
218 static struct swapdev	*swapdrum_getsdp(int);
219 
220 static struct swapdev	*swaplist_find(struct vnode *, bool);
221 static void		 swaplist_insert(struct swapdev *,
222 					 struct swappri *, int);
223 static void		 swaplist_trim(void);
224 
225 static int swap_on(struct lwp *, struct swapdev *);
226 static int swap_off(struct lwp *, struct swapdev *);
227 
228 static void sw_reg_strategy(struct swapdev *, struct buf *, int);
229 static void sw_reg_biodone(struct buf *);
230 static void sw_reg_iodone(struct work *wk, void *dummy);
231 static void sw_reg_start(struct swapdev *);
232 
233 static int uvm_swap_io(struct vm_page **, int, int, int);
234 
235 static void uvm_swap_genkey(struct swapdev *);
236 static void uvm_swap_encryptpage(struct swapdev *, void *, int);
237 static void uvm_swap_decryptpage(struct swapdev *, void *, int);
238 
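/*
 * encmap_size: compute the size in bytes of the bitmap used to track
 *	which swap slots hold encrypted data: one bit per page, rounded
 *	up to a whole number of 32-bit words (e.g. npages = 100 ->
 *	4 words -> 16 bytes).
 */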
239 static size_t
240 encmap_size(size_t npages)
241 {
242 	struct swapdev *sdp;
243 	const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
244 	const size_t bitsperword = NBBY * bytesperword;
245 	const size_t nbits = npages; /* one bit for each page */
246 	const size_t nwords = howmany(nbits, bitsperword);
247 	const size_t nbytes = nwords * bytesperword;
248 
249 	return nbytes;
250 }
251 
252 /*
253  * uvm_swap_init: init the swap system data structures and locks
254  *
255  * => called at boot time from init_main.c after the filesystems
256  *	are brought up (which happens after uvm_init())
257  */
258 void
259 uvm_swap_init(void)
260 {
261 	UVMHIST_FUNC(__func__);
262 
263 	UVMHIST_CALLED(pdhist);
264 	/*
265 	 * first, init the swap list, its counter, and its lock.
266 	 * then get a handle on the vnode for /dev/drum by using
267 	 * its dev_t number ("swapdev", from MD conf.c).
268 	 */
269 
270 	LIST_INIT(&swap_priority);
271 	uvmexp.nswapdev = 0;
272 	rw_init(&swap_syscall_lock);
273 	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
274 
275 	if (bdevvp(swapdev, &swapdev_vp))
276 		panic("%s: can't get vnode for swap device", __func__);
277 	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
278 		panic("%s: can't lock swap device", __func__);
279 	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
280 		panic("%s: can't open swap device", __func__);
281 	VOP_UNLOCK(swapdev_vp);
282 
283 	/*
284 	 * create swap block resource map to map /dev/drum.   the range
285 	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
286 	 * that block 0 is reserved (used to indicate an allocation
287 	 * failure, or no allocation).
288 	 */
289 	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
290 	    VM_NOSLEEP, IPL_NONE);
291 	if (swapmap == NULL) {
292 		panic("%s: vmem_create failed", __func__);
293 	}
294 
295 	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
296 	    NULL, IPL_BIO);
297 	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
298 	    NULL, IPL_BIO);
299 
300 	uvm_swap_init_done = true;
301 
302 	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
303 }
304 
305 /*
306  * swaplist functions: functions that operate on the list of swap
307  * devices on the system.
308  */
309 
310 /*
311  * swaplist_insert: insert swap device "sdp" into the global list
312  *
313  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
314  * => caller must provide a newly allocated swappri structure (we will
315  *	FREE it if we don't need it... this is to prevent allocation
316  *	blocking here while adding swap)
317  */
318 static void
319 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
320 {
321 	struct swappri *spp, *pspp;
322 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
323 
324 	KASSERT(rw_write_held(&swap_syscall_lock));
325 	KASSERT(mutex_owned(&uvm_swap_data_lock));
326 
327 	/*
328 	 * find entry at or after which to insert the new device.
329 	 */
330 	pspp = NULL;
331 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
332 		if (priority <= spp->spi_priority)
333 			break;
334 		pspp = spp;
335 	}
336 
337 	/*
338 	 * new priority?
339 	 */
340 	if (spp == NULL || spp->spi_priority != priority) {
341 		spp = newspp;  /* use newspp! */
342 		UVMHIST_LOG(pdhist, "created new swappri = %jd",
343 			    priority, 0, 0, 0);
344 
345 		spp->spi_priority = priority;
346 		TAILQ_INIT(&spp->spi_swapdev);
347 
348 		if (pspp)
349 			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
350 		else
351 			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
352 	} else {
353 	  	/* we don't need a new priority structure, free it */
354 		kmem_free(newspp, sizeof(*newspp));
355 	}
356 
357 	/*
358 	 * priority found (or created).   now insert on the priority's
359 	 * tailq list and bump the total number of swapdevs.
360 	 */
361 	sdp->swd_priority = priority;
362 	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
363 	uvmexp.nswapdev++;
364 }
365 
366 /*
367  * swaplist_find: find and optionally remove a swap device from the
368  *	global list.
369  *
370  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
371  * => we return the swapdev we found (and removed)
372  */
373 static struct swapdev *
374 swaplist_find(struct vnode *vp, bool remove)
375 {
376 	struct swapdev *sdp;
377 	struct swappri *spp;
378 
379 	KASSERT(rw_lock_held(&swap_syscall_lock));
380 	KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1);
381 	KASSERT(mutex_owned(&uvm_swap_data_lock));
382 
383 	/*
384 	 * search the lists for the requested vp
385 	 */
386 
387 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
388 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
389 			if (sdp->swd_vp == vp) {
390 				if (remove) {
391 					TAILQ_REMOVE(&spp->spi_swapdev,
392 					    sdp, swd_next);
393 					uvmexp.nswapdev--;
394 				}
395 				return(sdp);
396 			}
397 		}
398 	}
399 	return (NULL);
400 }
401 
402 /*
403  * swaplist_trim: scan priority list for empty priority entries and kill
404  *	them.
405  *
406  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
407  */
408 static void
409 swaplist_trim(void)
410 {
411 	struct swappri *spp, *nextspp;
412 
413 	KASSERT(rw_write_held(&swap_syscall_lock));
414 	KASSERT(mutex_owned(&uvm_swap_data_lock));
415 
416 	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
417 		if (!TAILQ_EMPTY(&spp->spi_swapdev))
418 			continue;
419 		LIST_REMOVE(spp, spi_swappri);
420 		kmem_free(spp, sizeof(*spp));
421 	}
422 }
423 
424 /*
425  * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
426  *	to the "swapdev" that maps that section of the drum.
427  *
428  * => each swapdev takes one big contig chunk of the drum
429  * => caller must hold uvm_swap_data_lock
430  */
431 static struct swapdev *
432 swapdrum_getsdp(int pgno)
433 {
434 	struct swapdev *sdp;
435 	struct swappri *spp;
436 
437 	KASSERT(mutex_owned(&uvm_swap_data_lock));
438 
439 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
440 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
441 			if (sdp->swd_flags & SWF_FAKE)
442 				continue;
443 			if (pgno >= sdp->swd_drumoffset &&
444 			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
445 				return sdp;
446 			}
447 		}
448 	}
449 	return NULL;
450 }
451 
452 /*
453  * swapdrum_sdp_is: true iff the swap device for pgno is sdp
454  *
455  * => for use in positive assertions only; result is not stable
456  */
457 static bool __debugused
458 swapdrum_sdp_is(int pgno, struct swapdev *sdp)
459 {
460 	bool result;
461 
462 	mutex_enter(&uvm_swap_data_lock);
463 	result = swapdrum_getsdp(pgno) == sdp;
464 	mutex_exit(&uvm_swap_data_lock);
465 
466 	return result;
467 }
468 
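/*
 * swapsys_lock/swapsys_unlock: acquire/release the swap_syscall_lock
 *	(as reader or writer, per "op") on behalf of callers outside
 *	this file.
 */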
469 void swapsys_lock(krw_t op)
470 {
471 	rw_enter(&swap_syscall_lock, op);
472 }
473 
474 void swapsys_unlock(void)
475 {
476 	rw_exit(&swap_syscall_lock);
477 }
478 
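/*
 * swapent_cvt: fill in the swapent (the userland view of a swap
 *	device) from a swapdev; "inuse" is the in-use count in units
 *	of disk blocks.
 */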
479 static void
480 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
481 {
482 	se->se_dev = sdp->swd_dev;
483 	se->se_flags = sdp->swd_flags;
484 	se->se_nblks = sdp->swd_nblks;
485 	se->se_inuse = inuse;
486 	se->se_priority = sdp->swd_priority;
487 	KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
488 	strcpy(se->se_path, sdp->swd_path);
489 }
490 
491 int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
492     (void *)enosys;
493 int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
494     (void *)enosys;
495 
496 /*
497  * sys_swapctl: main entry point for swapctl(2) system call
498  * 	[with two helper functions: swap_on and swap_off]
499  */
500 int
501 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
502 {
503 	/* {
504 		syscallarg(int) cmd;
505 		syscallarg(void *) arg;
506 		syscallarg(int) misc;
507 	} */
508 	struct vnode *vp;
509 	struct nameidata nd;
510 	struct swappri *spp;
511 	struct swapdev *sdp;
512 #define SWAP_PATH_MAX (PATH_MAX + 1)
513 	char	*userpath;
514 	size_t	len = 0;
515 	int	error;
516 	int	priority;
517 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
518 
519 	/*
520 	 * we handle the non-priv NSWAP and STATS requests first.
521 	 *
522 	 * SWAP_NSWAP: return number of config'd swap devices
523 	 * [can also be obtained with uvmexp sysctl]
524 	 */
525 	if (SCARG(uap, cmd) == SWAP_NSWAP) {
526 		const int nswapdev = uvmexp.nswapdev;
527 		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
528 		    0, 0, 0);
529 		*retval = nswapdev;
530 		return 0;
531 	}
532 
533 	userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);
534 
535 	/*
536 	 * ensure serialized syscall access by grabbing the swap_syscall_lock
537 	 */
538 	rw_enter(&swap_syscall_lock, RW_WRITER);
539 
540 	/*
541 	 * SWAP_STATS: get stats on current # of configured swap devs
542 	 *
543 	 * note that the swap_priority list can't change as long
544 	 * as we are holding the swap_syscall_lock.  we don't want
545 	 * to grab the uvm_swap_data_lock because we may fault&sleep during
546 	 * copyout() and we don't want to be holding that lock then!
547 	 */
548 	switch (SCARG(uap, cmd)) {
549 	case SWAP_STATS13:
550 		error = (*uvm_swap_stats13)(uap, retval);
551 		goto out;
552 	case SWAP_STATS50:
553 		error = (*uvm_swap_stats50)(uap, retval);
554 		goto out;
555 	case SWAP_STATS:
556 		error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
557 		    NULL, sizeof(struct swapent), retval);
558 		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
559 		goto out;
560 
561 	case SWAP_GETDUMPDEV:
562 		error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
563 		goto out;
564 	default:
565 		break;
566 	}
567 
568 	/*
569 	 * all other requests require superuser privs.   verify.
570 	 */
571 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
572 	    0, NULL, NULL, NULL)))
573 		goto out;
574 
575 	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
576 		/* drop the current dump device */
577 		dumpdev = NODEV;
578 		dumpcdev = NODEV;
579 		cpu_dumpconf();
580 		goto out;
581 	}
582 
583 	/*
584 	 * at this point we expect a path name in arg.   we will
585 	 * use namei() to gain a vnode reference (vref), and lock
586 	 * the vnode (VOP_LOCK).
587 	 *
588 	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
589 	 * miniroot)
590 	 */
591 	if (SCARG(uap, arg) == NULL) {
592 		vp = rootvp;		/* miniroot */
593 		vref(vp);
594 		if (vn_lock(vp, LK_EXCLUSIVE)) {
595 			vrele(vp);
596 			error = EBUSY;
597 			goto out;
598 		}
599 		if (SCARG(uap, cmd) == SWAP_ON &&
600 		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
601 			panic("swapctl: miniroot copy failed");
602 	} else {
603 		struct pathbuf *pb;
604 
605 		/*
606 		 * This used to allow copying in one extra byte
607 		 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
608 		 * This was completely pointless because if anyone
609 		 * used that extra byte namei would fail with
610 		 * ENAMETOOLONG anyway, so I've removed the excess
611 		 * logic. - dholland 20100215
612 		 */
613 
614 		error = pathbuf_copyin(SCARG(uap, arg), &pb);
615 		if (error) {
616 			goto out;
617 		}
618 		if (SCARG(uap, cmd) == SWAP_ON) {
619 			/* get a copy of the string */
620 			pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
621 			len = strlen(userpath) + 1;
622 		}
623 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
624 		if ((error = namei(&nd))) {
625 			pathbuf_destroy(pb);
626 			goto out;
627 		}
628 		vp = nd.ni_vp;
629 		pathbuf_destroy(pb);
630 	}
631 	/* note: "vp" is referenced and locked */
632 
633 	error = 0;		/* assume no error */
634 	switch(SCARG(uap, cmd)) {
635 
636 	case SWAP_DUMPDEV:
637 		if (vp->v_type != VBLK) {
638 			error = ENOTBLK;
639 			break;
640 		}
641 		if (bdevsw_lookup(vp->v_rdev)) {
642 			dumpdev = vp->v_rdev;
643 			dumpcdev = devsw_blk2chr(dumpdev);
644 		} else
645 			dumpdev = NODEV;
646 		cpu_dumpconf();
647 		break;
648 
649 	case SWAP_CTL:
650 		/*
651 		 * get new priority, remove old entry (if any) and then
652 		 * reinsert it in the correct place.  finally, prune out
653 		 * any empty priority structures.
654 		 */
655 		priority = SCARG(uap, misc);
656 		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
657 		mutex_enter(&uvm_swap_data_lock);
658 		if ((sdp = swaplist_find(vp, true)) == NULL) {
659 			error = ENOENT;
660 		} else {
661 			swaplist_insert(sdp, spp, priority);
662 			swaplist_trim();
663 		}
664 		mutex_exit(&uvm_swap_data_lock);
665 		if (error)
666 			kmem_free(spp, sizeof(*spp));
667 		break;
668 
669 	case SWAP_ON:
670 
671 		/*
672 		 * check for duplicates.   if none found, then insert a
673 		 * dummy entry on the list to prevent someone else from
674 		 * trying to enable this device while we are working on
675 		 * it.
676 		 */
677 
678 		priority = SCARG(uap, misc);
679 		sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
680 		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
681 		sdp->swd_flags = SWF_FAKE;
682 		sdp->swd_vp = vp;
683 		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
684 		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
685 		mutex_enter(&uvm_swap_data_lock);
686 		if (swaplist_find(vp, false) != NULL) {
687 			error = EBUSY;
688 			mutex_exit(&uvm_swap_data_lock);
689 			bufq_free(sdp->swd_tab);
690 			kmem_free(sdp, sizeof(*sdp));
691 			kmem_free(spp, sizeof(*spp));
692 			break;
693 		}
694 		swaplist_insert(sdp, spp, priority);
695 		mutex_exit(&uvm_swap_data_lock);
696 
697 		KASSERT(len > 0);
698 		sdp->swd_pathlen = len;
699 		sdp->swd_path = kmem_alloc(len, KM_SLEEP);
700 		if (copystr(userpath, sdp->swd_path, len, 0) != 0)
701 			panic("swapctl: copystr");
702 
703 		/*
704 		 * we've now got a FAKE placeholder in the swap list.
705 		 * now attempt to enable swap on it.  if we fail, undo
706 		 * what we've done and kill the fake entry we just inserted.
707 		 * if swap_on is a success, it will clear the SWF_FAKE flag
708 		 */
709 
710 		if ((error = swap_on(l, sdp)) != 0) {
711 			mutex_enter(&uvm_swap_data_lock);
712 			(void) swaplist_find(vp, true);  /* kill fake entry */
713 			swaplist_trim();
714 			mutex_exit(&uvm_swap_data_lock);
715 			bufq_free(sdp->swd_tab);
716 			kmem_free(sdp->swd_path, sdp->swd_pathlen);
717 			kmem_free(sdp, sizeof(*sdp));
718 			break;
719 		}
720 		break;
721 
722 	case SWAP_OFF:
723 		mutex_enter(&uvm_swap_data_lock);
724 		if ((sdp = swaplist_find(vp, false)) == NULL) {
725 			mutex_exit(&uvm_swap_data_lock);
726 			error = ENXIO;
727 			break;
728 		}
729 
730 		/*
731 		 * If a device isn't in use or enabled, we
732 		 * can't stop swapping from it (again).
733 		 */
734 		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
735 			mutex_exit(&uvm_swap_data_lock);
736 			error = EBUSY;
737 			break;
738 		}
739 
740 		/*
741 		 * do the real work.
742 		 */
743 		error = swap_off(l, sdp);
744 		break;
745 
746 	default:
747 		error = EINVAL;
748 	}
749 
750 	/*
751 	 * done!  release the ref gained by namei() and unlock.
752 	 */
753 	vput(vp);
754 out:
755 	rw_exit(&swap_syscall_lock);
756 	kmem_free(userpath, SWAP_PATH_MAX);
757 
758 	UVMHIST_LOG(pdhist, "<- done!  error=%jd", error, 0, 0, 0);
759 	return (error);
760 }
761 
762 /*
763  * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
764  * away from sys_swapctl() in order to allow COMPAT_* swapctl()
765  * emulation to use it directly without going through sys_swapctl().
766  * The problem with using sys_swapctl() there is that it involves
767  * copying the swapent array to the stackgap, and this array's size
768  * is not known at build time. Hence it would not be possible to
769  * ensure it would fit in the stackgap in any case.
770  */
771 int
772 uvm_swap_stats(char *ptr, int misc,
773     void (*f)(void *, const struct swapent *), size_t len,
774     register_t *retval)
775 {
776 	struct swappri *spp;
777 	struct swapdev *sdp;
778 	struct swapent sep;
779 	int count = 0;
780 	int error;
781 
782 	KASSERT(len <= sizeof(sep));
783 	if (len == 0)
784 		return ENOSYS;
785 
786 	if (misc < 0)
787 		return EINVAL;
788 
789 	if (misc == 0 || uvmexp.nswapdev == 0)
790 		return 0;
791 
792 	/* Make sure userland cannot exhaust kernel memory */
793 	if ((size_t)misc > (size_t)uvmexp.nswapdev)
794 		misc = uvmexp.nswapdev;
795 
796 	KASSERT(rw_lock_held(&swap_syscall_lock));
797 
798 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
799 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
800 			int inuse;
801 
802 			if (misc-- <= 0)
803 				break;
804 
805 			inuse = btodb((uint64_t)sdp->swd_npginuse <<
806 			    PAGE_SHIFT);
807 
808 			memset(&sep, 0, sizeof(sep));
809 			swapent_cvt(&sep, sdp, inuse);
810 			if (f)
811 				(*f)(&sep, &sep);
812 			if ((error = copyout(&sep, ptr, len)) != 0)
813 				return error;
814 			ptr += len;
815 			count++;
816 		}
817 	}
818 	*retval = count;
819 	return 0;
820 }
821 
822 /*
823  * swap_on: attempt to enable a swapdev for swapping.   note that the
824  *	swapdev is already on the global list, but disabled (marked
825  *	SWF_FAKE).
826  *
827  * => we avoid the start of the disk (to protect disk labels)
828  * => we also avoid the miniroot, if we are swapping to root.
829  * => caller should leave uvm_swap_data_lock unlocked, we may lock it
830  *	if needed.
831  */
832 static int
833 swap_on(struct lwp *l, struct swapdev *sdp)
834 {
835 	struct vnode *vp;
836 	int error, npages, nblocks, size;
837 	long addr;
838 	vmem_addr_t result;
839 	struct vattr va;
840 	dev_t dev;
841 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
842 
843 	/*
844 	 * we want to enable swapping on sdp.   the swd_vp contains
845 	 * the vnode we want (locked and ref'd), and the swd_dev
846 	 * contains the dev_t of the file, if it is a block device.
847 	 */
848 
849 	vp = sdp->swd_vp;
850 	dev = sdp->swd_dev;
851 
852 	/*
853 	 * open the swap file (mostly useful for block device files to
854 	 * let device driver know what is up).
855 	 *
856 	 * we skip the open/close for root on swap because the root
857 	 * has already been opened when root was mounted (mountroot).
858 	 */
859 	if (vp != rootvp) {
860 		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
861 			return (error);
862 	}
863 
864 	/* XXX this only works for block devices */
865 	UVMHIST_LOG(pdhist, "  dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);
866 
867 	/*
868 	 * we now need to determine the size of the swap area.   for
869 	 * block specials we can call the d_psize function.
870 	 * for normal files, we must stat [get attrs].
871 	 *
872 	 * we put the result in nblocks.
873 	 * for normal files, we also want the filesystem block size
874 	 * (which we get with statfs).
875 	 */
876 	switch (vp->v_type) {
877 	case VBLK:
878 		if ((nblocks = bdev_size(dev)) == -1) {
879 			error = ENXIO;
880 			goto bad;
881 		}
882 		break;
883 
884 	case VREG:
885 		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
886 			goto bad;
887 		nblocks = (int)btodb(va.va_size);
888 		sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
889 		/*
890 		 * limit the max # of outstanding I/O requests we issue
891 		 * at any one time.   take it easy on NFS servers.
892 		 */
893 		if (vp->v_tag == VT_NFS)
894 			sdp->swd_maxactive = 2; /* XXX */
895 		else
896 			sdp->swd_maxactive = 8; /* XXX */
897 		break;
898 
899 	default:
900 		error = ENXIO;
901 		goto bad;
902 	}
903 
904 	/*
905 	 * save nblocks in a safe place and convert to pages.
906 	 */
907 
908 	sdp->swd_nblks = nblocks;
909 	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
910 
911 	/*
912 	 * for block special files, we want to make sure that we leave
913 	 * the disklabel and bootblocks alone, so we arrange to skip
914 	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
915 	 * note that because of this the "size" can be less than the
916 	 * actual number of blocks on the device.
917 	 */
918 	if (vp->v_type == VBLK) {
919 		/* we use pages 1 to (npages - 1) [inclusive] */
920 		size = npages - 1;
921 		addr = 1;
922 	} else {
923 		/* we use pages 0 to (size - 1) [inclusive] */
924 		size = npages;
925 		addr = 0;
926 	}
927 
928 	/*
929 	 * make sure we have enough blocks for a reasonable sized swap
930 	 * area.   we want at least one page.
931 	 */
932 
933 	if (size < 1) {
934 		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
935 		error = EINVAL;
936 		goto bad;
937 	}
938 
939 	UVMHIST_LOG(pdhist, "  dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0);
940 
941 	/*
942 	 * now we need to allocate a blist to manage this swap device
943 	 */
944 
945 	sdp->swd_blist = blist_create(npages);
946 	/* mark all except the `saved' region free. */
947 	blist_free(sdp->swd_blist, addr, size);
948 
949 	/*
950 	 * allocate space for swap encryption state and mark the
951 	 * keys uninitialized so we generate them lazily
952 	 */
953 	sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP);
954 	sdp->swd_encinit = false;
955 
956 	/*
957 	 * if the vnode we are swapping to is the root vnode
958 	 * (i.e. we are swapping to the miniroot) then we want
959 	 * to make sure we don't overwrite it.   do a statfs to
960 	 * find its size and skip over it.
961 	 */
962 	if (vp == rootvp) {
963 		struct mount *mp;
964 		struct statvfs *sp;
965 		int rootblocks, rootpages;
966 
967 		mp = rootvnode->v_mount;
968 		sp = &mp->mnt_stat;
969 		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
970 		/*
971 		 * XXX: sp->f_blocks isn't the total number of
972 		 * blocks in the filesystem, it's the number of
973 		 * data blocks.  so, our rootblocks almost
974 		 * definitely underestimates the total size
975 		 * of the filesystem - how badly depends on the
976 		 * details of the filesystem type.  there isn't
977 		 * an obvious way to deal with this cleanly
978 		 * and perfectly, so for now we just pad our
979 		 * rootblocks estimate with an extra 5 percent.
980 		 */
981 		rootblocks += (rootblocks >> 5) +
982 			(rootblocks >> 6) +
983 			(rootblocks >> 7);
984 		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
985 		if (rootpages > size)
986 			panic("swap_on: miniroot larger than swap?");
987 
988 		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
989 			panic("swap_on: unable to preserve miniroot");
990 		}
991 
992 		size -= rootpages;
993 		printf("Preserved %d pages of miniroot ", rootpages);
994 		printf("leaving %d pages of swap\n", size);
995 	}
996 
997 	/*
998 	 * add a ref to vp to reflect usage as a swap device.
999 	 */
1000 	vref(vp);
1001 
1002 	/*
1003 	 * now add the new swapdev to the drum and enable.
1004 	 */
1005 	error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
1006 	if (error != 0)
1007 		panic("swapdrum_add");
1008 	/*
1009 	 * If this is the first regular swap create the workqueue.
1010 	 * => Protected by swap_syscall_lock.
1011 	 */
1012 	if (vp->v_type != VBLK) {
1013 		if (sw_reg_count++ == 0) {
1014 			KASSERT(sw_reg_workqueue == NULL);
1015 			if (workqueue_create(&sw_reg_workqueue, "swapiod",
1016 			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
1017 				panic("%s: workqueue_create failed", __func__);
1018 		}
1019 	}
1020 
1021 	sdp->swd_drumoffset = (int)result;
1022 	sdp->swd_drumsize = npages;
1023 	sdp->swd_npages = size;
1024 	mutex_enter(&uvm_swap_data_lock);
1025 	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
1026 	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
1027 	uvmexp.swpages += size;
1028 	uvmexp.swpgavail += size;
1029 	mutex_exit(&uvm_swap_data_lock);
1030 	return (0);
1031 
1032 	/*
1033 	 * failure: clean up and return error.
1034 	 */
1035 
1036 bad:
1037 	if (sdp->swd_blist) {
1038 		blist_destroy(sdp->swd_blist);
1039 	}
1040 	if (vp != rootvp) {
1041 		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
1042 	}
1043 	return (error);
1044 }
1045 
1046 /*
1047  * swap_off: stop swapping on swapdev
1048  *
1049  * => swap data should be locked, we will unlock.
1050  */
1051 static int
1052 swap_off(struct lwp *l, struct swapdev *sdp)
1053 {
1054 	int npages = sdp->swd_npages;
1055 	int error = 0;
1056 
1057 	UVMHIST_FUNC(__func__);
1058 	UVMHIST_CALLARGS(pdhist, "  dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0);
1059 
1060 	KASSERT(rw_write_held(&swap_syscall_lock));
1061 	KASSERT(mutex_owned(&uvm_swap_data_lock));
1062 
1063 	/* disable the swap area being removed */
1064 	sdp->swd_flags &= ~SWF_ENABLE;
1065 	uvmexp.swpgavail -= npages;
1066 	mutex_exit(&uvm_swap_data_lock);
1067 
1068 	/*
1069 	 * the idea is to find all the pages that are paged out to this
1070 	 * device, and page them all in.  in uvm, swap-backed pageable
1071 	 * memory can take two forms: aobjs and anons.  call the
1072 	 * swapoff hook for each subsystem to bring in pages.
1073 	 */
1074 
1075 	if (uao_swap_off(sdp->swd_drumoffset,
1076 			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
1077 	    amap_swap_off(sdp->swd_drumoffset,
1078 			  sdp->swd_drumoffset + sdp->swd_drumsize)) {
1079 		error = ENOMEM;
1080 	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
1081 		error = EBUSY;
1082 	}
1083 
1084 	if (error) {
1085 		mutex_enter(&uvm_swap_data_lock);
1086 		sdp->swd_flags |= SWF_ENABLE;
1087 		uvmexp.swpgavail += npages;
1088 		mutex_exit(&uvm_swap_data_lock);
1089 
1090 		return error;
1091 	}
1092 
1093 	/*
1094 	 * If this is the last regular swap destroy the workqueue.
1095 	 * => Protected by swap_syscall_lock.
1096 	 */
1097 	if (sdp->swd_vp->v_type != VBLK) {
1098 		KASSERT(sw_reg_count > 0);
1099 		KASSERT(sw_reg_workqueue != NULL);
1100 		if (--sw_reg_count == 0) {
1101 			workqueue_destroy(sw_reg_workqueue);
1102 			sw_reg_workqueue = NULL;
1103 		}
1104 	}
1105 
1106 	/*
1107 	 * done with the vnode.
1108 	 * drop our ref on the vnode before calling VOP_CLOSE()
1109 	 * so that spec_close() can tell if this is the last close.
1110 	 */
1111 	vrele(sdp->swd_vp);
1112 	if (sdp->swd_vp != rootvp) {
1113 		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
1114 	}
1115 
1116 	mutex_enter(&uvm_swap_data_lock);
1117 	uvmexp.swpages -= npages;
1118 	uvmexp.swpginuse -= sdp->swd_npgbad;
1119 
1120 	if (swaplist_find(sdp->swd_vp, true) == NULL)
1121 		panic("%s: swapdev not in list", __func__);
1122 	swaplist_trim();
1123 	mutex_exit(&uvm_swap_data_lock);
1124 
1125 	/*
1126 	 * free all resources!
1127 	 */
1128 	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
1129 	blist_destroy(sdp->swd_blist);
1130 	bufq_free(sdp->swd_tab);
1131 	kmem_free(__UNVOLATILE(sdp->swd_encmap),
1132 	    encmap_size(sdp->swd_drumsize));
1133 	explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey);
1134 	explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey);
1135 	kmem_free(sdp, sizeof(*sdp));
1136 	return (0);
1137 }
1138 
1139 void
1140 uvm_swap_shutdown(struct lwp *l)
1141 {
1142 	struct swapdev *sdp;
1143 	struct swappri *spp;
1144 	struct vnode *vp;
1145 	int error;
1146 
1147 	if (!uvm_swap_init_done)
1148 		return;
1149 	printf("turning off swap...");
1150 	rw_enter(&swap_syscall_lock, RW_WRITER);
1151 	mutex_enter(&uvm_swap_data_lock);
1152 again:
1153 	LIST_FOREACH(spp, &swap_priority, spi_swappri)
1154 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1155 			if (sdp->swd_flags & SWF_FAKE)
1156 				continue;
1157 			if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
1158 				continue;
1159 #ifdef DEBUG
1160 			printf("\nturning off swap on %s...", sdp->swd_path);
1161 #endif
1162 			/* Have to lock and reference vnode for swap_off(). */
1163 			vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY);
1164 			vref(vp);
1165 			error = swap_off(l, sdp);
1166 			vput(vp);
1167 			mutex_enter(&uvm_swap_data_lock);
1168 			if (error) {
1169 				printf("stopping swap on %s failed "
1170 				    "with error %d\n", sdp->swd_path, error);
1171 				TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1172 				uvmexp.nswapdev--;
1173 				swaplist_trim();
1174 			}
1175 			goto again;
1176 		}
1177 	printf(" done\n");
1178 	mutex_exit(&uvm_swap_data_lock);
1179 	rw_exit(&swap_syscall_lock);
1180 }
1181 
1182 
1183 /*
1184  * /dev/drum interface and i/o functions
1185  */
1186 
1187 /*
1188  * swstrategy: perform I/O on the drum
1189  *
1190  * => we must map the i/o request from the drum to the correct swapdev.
1191  */
1192 static void
1193 swstrategy(struct buf *bp)
1194 {
1195 	struct swapdev *sdp;
1196 	struct vnode *vp;
1197 	int pageno, bn;
1198 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1199 
1200 	/*
1201 	 * convert block number to swapdev.   note that swapdev can't
1202 	 * be yanked out from under us because we are holding resources
1203 	 * in it (i.e. the blocks we are doing I/O on).
1204 	 */
1205 	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
1206 	mutex_enter(&uvm_swap_data_lock);
1207 	sdp = swapdrum_getsdp(pageno);
1208 	mutex_exit(&uvm_swap_data_lock);
1209 	if (sdp == NULL) {
1210 		bp->b_error = EINVAL;
1211 		bp->b_resid = bp->b_bcount;
1212 		biodone(bp);
1213 		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
1214 		return;
1215 	}
1216 
1217 	/*
1218 	 * convert drum page number to block number on this swapdev.
1219 	 */
1220 
1221 	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
1222 	bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
1223 
1224 	UVMHIST_LOG(pdhist, "  Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd",
1225 		((bp->b_flags & B_READ) == 0) ? 1 : 0,
1226 		sdp->swd_drumoffset, bn, bp->b_bcount);
1227 
1228 	/*
1229 	 * for block devices we finish up here.
1230 	 * for regular files we have to do more work which we delegate
1231 	 * to sw_reg_strategy().
1232 	 */
1233 
1234 	vp = sdp->swd_vp;		/* swapdev vnode pointer */
1235 	switch (vp->v_type) {
1236 	default:
1237 		panic("%s: vnode type 0x%x", __func__, vp->v_type);
1238 
1239 	case VBLK:
1240 
1241 		/*
1242 		 * must convert "bp" from an I/O on /dev/drum to an I/O
1243 		 * on the swapdev (sdp).
1244 		 */
1245 		bp->b_blkno = bn;		/* swapdev block number */
1246 		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
1247 
1248 		/*
1249 		 * if we are doing a write, we have to redirect the i/o on
1250 		 * drum's v_numoutput counter to the swapdevs.
1251 		 */
1252 		if ((bp->b_flags & B_READ) == 0) {
1253 			mutex_enter(bp->b_objlock);
1254 			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
1255 			mutex_exit(bp->b_objlock);
1256 			mutex_enter(vp->v_interlock);
1257 			vp->v_numoutput++;	/* put it on swapdev */
1258 			mutex_exit(vp->v_interlock);
1259 		}
1260 
1261 		/*
1262 		 * finally plug in swapdev vnode and start I/O
1263 		 */
1264 		bp->b_vp = vp;
1265 		bp->b_objlock = vp->v_interlock;
1266 		VOP_STRATEGY(vp, bp);
1267 		return;
1268 
1269 	case VREG:
1270 		/*
1271 		 * delegate to sw_reg_strategy function.
1272 		 */
1273 		sw_reg_strategy(sdp, bp, bn);
1274 		return;
1275 	}
1276 	/* NOTREACHED */
1277 }
1278 
1279 /*
1280  * swread: the read function for the drum (just a call to physio)
1281  */
1282 /*ARGSUSED*/
1283 static int
1284 swread(dev_t dev, struct uio *uio, int ioflag)
1285 {
1286 	UVMHIST_FUNC(__func__);
1287 	UVMHIST_CALLARGS(pdhist, "  dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
1288 
1289 	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1290 }
1291 
1292 /*
1293  * swwrite: the write function for the drum (just a call to physio)
1294  */
1295 /*ARGSUSED*/
1296 static int
1297 swwrite(dev_t dev, struct uio *uio, int ioflag)
1298 {
1299 	UVMHIST_FUNC(__func__);
1300 	UVMHIST_CALLARGS(pdhist, "  dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
1301 
1302 	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1303 }
1304 
1305 const struct bdevsw swap_bdevsw = {
1306 	.d_open = nullopen,
1307 	.d_close = nullclose,
1308 	.d_strategy = swstrategy,
1309 	.d_ioctl = noioctl,
1310 	.d_dump = nodump,
1311 	.d_psize = nosize,
1312 	.d_discard = nodiscard,
1313 	.d_flag = D_OTHER
1314 };
1315 
1316 const struct cdevsw swap_cdevsw = {
1317 	.d_open = nullopen,
1318 	.d_close = nullclose,
1319 	.d_read = swread,
1320 	.d_write = swwrite,
1321 	.d_ioctl = noioctl,
1322 	.d_stop = nostop,
1323 	.d_tty = notty,
1324 	.d_poll = nopoll,
1325 	.d_mmap = nommap,
1326 	.d_kqfilter = nokqfilter,
1327 	.d_discard = nodiscard,
1328 	.d_flag = D_OTHER,
1329 };
1330 
1331 /*
1332  * sw_reg_strategy: handle swap i/o to regular files
1333  */
1334 static void
1335 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
1336 {
1337 	struct vnode	*vp;
1338 	struct vndxfer	*vnx;
1339 	daddr_t		nbn;
1340 	char 		*addr;
1341 	off_t		byteoff;
1342 	int		s, off, nra, error, sz, resid;
1343 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1344 
1345 	/*
1346 	 * allocate a vndxfer head for this transfer and point it to
1347 	 * our buffer.
1348 	 */
1349 	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
1350 	vnx->vx_flags = VX_BUSY;
1351 	vnx->vx_error = 0;
1352 	vnx->vx_pending = 0;
1353 	vnx->vx_bp = bp;
1354 	vnx->vx_sdp = sdp;
1355 
1356 	/*
1357 	 * setup for main loop where we read filesystem blocks into
1358 	 * our buffer.
1359 	 */
1360 	error = 0;
1361 	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
1362 	addr = bp->b_data;		/* current position in buffer */
1363 	byteoff = dbtob((uint64_t)bn);
1364 
1365 	for (resid = bp->b_resid; resid; resid -= sz) {
1366 		struct vndbuf	*nbp;
1367 
1368 		/*
1369 		 * translate byteoffset into block number.  return values:
1370 		 *   vp = vnode of underlying device
1371 		 *  nbn = new block number (on underlying vnode dev)
1372 		 *  nra = num blocks we can read-ahead (excludes requested
1373 		 *	block)
1374 		 */
1375 		nra = 0;
1376 		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1377 				 	&vp, &nbn, &nra);
1378 
1379 		if (error == 0 && nbn == (daddr_t)-1) {
1380 			/*
1381 			 * this used to just set error, but that doesn't
1382 			 * do the right thing.  Instead, it causes random
1383 			 * memory errors.  The panic() should remain until
1384 			 * this condition doesn't destabilize the system.
1385 			 */
1386 #if 1
1387 			panic("%s: swap to sparse file", __func__);
1388 #else
1389 			error = EIO;	/* failure */
1390 #endif
1391 		}
1392 
1393 		/*
1394 		 * punt if there was an error or a hole in the file.
1395 		 * we must wait for any i/o ops we have already started
1396 		 * to finish before returning.
1397 		 *
1398 		 * XXX we could deal with holes here but it would be
1399 		 * a hassle (in the write case).
1400 		 */
1401 		if (error) {
1402 			s = splbio();
1403 			vnx->vx_error = error;	/* pass error up */
1404 			goto out;
1405 		}
1406 
1407 		/*
1408 		 * compute the size ("sz") of this transfer (in bytes).
1409 		 */
1410 		off = byteoff % sdp->swd_bsize;
1411 		sz = (1 + nra) * sdp->swd_bsize - off;
1412 		if (sz > resid)
1413 			sz = resid;
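
		/*
		 * e.g. with swd_bsize 8192 and byteoff 10240: off is
		 * 2048 and, with nra == 0 and enough resid, sz is
		 * 6144, i.e. up to the end of the current filesystem
		 * block.
		 */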
1414 
1415 		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
1416 		    "vp %#jx/%#jx offset %#jx/%#jx",
1417 		    (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);
1418 
1419 		/*
1420 		 * now get a buf structure.   note that the vb_buf is
1421 		 * at the front of the nbp structure so that you can
1422 		 * cast pointers between the two structures easily.
1423 		 */
1424 		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
1425 		buf_init(&nbp->vb_buf);
1426 		nbp->vb_buf.b_flags    = bp->b_flags;
1427 		nbp->vb_buf.b_cflags   = bp->b_cflags;
1428 		nbp->vb_buf.b_oflags   = bp->b_oflags;
1429 		nbp->vb_buf.b_bcount   = sz;
1430 		nbp->vb_buf.b_bufsize  = sz;
1431 		nbp->vb_buf.b_error    = 0;
1432 		nbp->vb_buf.b_data     = addr;
1433 		nbp->vb_buf.b_lblkno   = 0;
1434 		nbp->vb_buf.b_blkno    = nbn + btodb(off);
1435 		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
1436 		nbp->vb_buf.b_iodone   = sw_reg_biodone;
1437 		nbp->vb_buf.b_vp       = vp;
1438 		nbp->vb_buf.b_objlock  = vp->v_interlock;
1439 		if (vp->v_type == VBLK) {
1440 			nbp->vb_buf.b_dev = vp->v_rdev;
1441 		}
1442 
1443 		nbp->vb_xfer = vnx;	/* patch it back in to vnx */
1444 
1445 		/*
1446 		 * Just sort by block number
1447 		 */
1448 		s = splbio();
1449 		if (vnx->vx_error != 0) {
1450 			buf_destroy(&nbp->vb_buf);
1451 			pool_put(&vndbuf_pool, nbp);
1452 			goto out;
1453 		}
1454 		vnx->vx_pending++;
1455 
1456 		/* sort it in and start I/O if we are not over our limit */
1457 		/* XXXAD locking */
1458 		bufq_put(sdp->swd_tab, &nbp->vb_buf);
1459 		sw_reg_start(sdp);
1460 		splx(s);
1461 
1462 		/*
1463 		 * advance to the next I/O
1464 		 */
1465 		byteoff += sz;
1466 		addr += sz;
1467 	}
1468 
1469 	s = splbio();
1470 
1471 out: /* Arrive here at splbio */
1472 	vnx->vx_flags &= ~VX_BUSY;
1473 	if (vnx->vx_pending == 0) {
1474 		error = vnx->vx_error;
1475 		pool_put(&vndxfer_pool, vnx);
1476 		bp->b_error = error;
1477 		biodone(bp);
1478 	}
1479 	splx(s);
1480 }
1481 
1482 /*
1483  * sw_reg_start: start an I/O request on the requested swapdev
1484  *
1485  * => reqs are sorted by b_rawblkno (above)
1486  */
1487 static void
1488 sw_reg_start(struct swapdev *sdp)
1489 {
1490 	struct buf	*bp;
1491 	struct vnode	*vp;
1492 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1493 
1494 	/* recursion control */
1495 	if ((sdp->swd_flags & SWF_BUSY) != 0)
1496 		return;
1497 
1498 	sdp->swd_flags |= SWF_BUSY;
1499 
1500 	while (sdp->swd_active < sdp->swd_maxactive) {
1501 		bp = bufq_get(sdp->swd_tab);
1502 		if (bp == NULL)
1503 			break;
1504 		sdp->swd_active++;
1505 
1506 		UVMHIST_LOG(pdhist,
1507 		    "sw_reg_start:  bp %#jx vp %#jx blkno %#jx cnt %#jx",
1508 		    (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
1509 		    bp->b_bcount);
1510 		vp = bp->b_vp;
1511 		KASSERT(bp->b_objlock == vp->v_interlock);
1512 		if ((bp->b_flags & B_READ) == 0) {
1513 			mutex_enter(vp->v_interlock);
1514 			vp->v_numoutput++;
1515 			mutex_exit(vp->v_interlock);
1516 		}
1517 		VOP_STRATEGY(vp, bp);
1518 	}
1519 	sdp->swd_flags &= ~SWF_BUSY;
1520 }
1521 
1522 /*
1523  * sw_reg_biodone: one of our i/o's has completed
1524  */
1525 static void
1526 sw_reg_biodone(struct buf *bp)
1527 {
1528 	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
1529 }
1530 
1531 /*
1532  * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1533  *
1534  * => note that we can recover the vndbuf struct by casting the buf ptr
1535  */
1536 static void
1537 sw_reg_iodone(struct work *wk, void *dummy)
1538 {
1539 	struct vndbuf *vbp = (void *)wk;
1540 	struct vndxfer *vnx = vbp->vb_xfer;
1541 	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
1542 	struct swapdev	*sdp = vnx->vx_sdp;
1543 	int s, resid, error;
1544 	KASSERT(&vbp->vb_buf.b_work == wk);
1545 	UVMHIST_FUNC(__func__);
1546 	UVMHIST_CALLARGS(pdhist, "  vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx",
1547 	    (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
1548 	    (uintptr_t)vbp->vb_buf.b_data);
1549 	UVMHIST_LOG(pdhist, "  cnt=%#jx resid=%#jx",
1550 	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
1551 
1552 	/*
1553 	 * protect vbp at splbio and update.
1554 	 */
1555 
1556 	s = splbio();
1557 	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1558 	pbp->b_resid -= resid;
1559 	vnx->vx_pending--;
1560 
1561 	if (vbp->vb_buf.b_error != 0) {
1562 		/* pass error upward */
1563 		error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
1564 		UVMHIST_LOG(pdhist, "  got error=%jd !", error, 0, 0, 0);
1565 		vnx->vx_error = error;
1566 	}
1567 
1568 	/*
1569 	 * kill vbp structure
1570 	 */
1571 	buf_destroy(&vbp->vb_buf);
1572 	pool_put(&vndbuf_pool, vbp);
1573 
1574 	/*
1575 	 * wrap up this transaction if it has run to completion or, in
1576 	 * case of an error, when all auxiliary buffers have returned.
1577 	 */
1578 	if (vnx->vx_error != 0) {
1579 		/* pass error upward */
1580 		error = vnx->vx_error;
1581 		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
1582 			pbp->b_error = error;
1583 			biodone(pbp);
1584 			pool_put(&vndxfer_pool, vnx);
1585 		}
1586 	} else if (pbp->b_resid == 0) {
1587 		KASSERT(vnx->vx_pending == 0);
1588 		if ((vnx->vx_flags & VX_BUSY) == 0) {
1589 			UVMHIST_LOG(pdhist, "  iodone, pbp=%#jx error=%jd !",
1590 			    (uintptr_t)pbp, vnx->vx_error, 0, 0);
1591 			biodone(pbp);
1592 			pool_put(&vndxfer_pool, vnx);
1593 		}
1594 	}
1595 
1596 	/*
1597 	 * done!   start next swapdev I/O if one is pending
1598 	 */
1599 	sdp->swd_active--;
1600 	sw_reg_start(sdp);
1601 	splx(s);
1602 }
1603 
1604 
1605 /*
1606  * uvm_swap_alloc: allocate space on swap
1607  *
1608  * => allocation is done "round robin" down the priority list, as we
1609  *	allocate in a priority we "rotate" the circle queue.
1610  * => space can be freed with uvm_swap_free
1611  * => we return the page slot number in /dev/drum (0 == invalid slot)
1612  * => we lock uvm_swap_data_lock
1613  * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1614  */
1615 int
1616 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
1617 {
1618 	struct swapdev *sdp;
1619 	struct swappri *spp;
1620 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1621 
1622 	/*
1623 	 * no swap devices configured yet?   definite failure.
1624 	 */
1625 	if (uvmexp.nswapdev < 1)
1626 		return 0;
1627 
1628 	/*
1629 	 * XXXJAK: BEGIN HACK
1630 	 *
1631 	 * blist_alloc() in subr_blist.c will panic if we try to allocate
1632 	 * too many slots.
1633 	 */
1634 	if (*nslots > BLIST_MAX_ALLOC) {
1635 		if (__predict_false(lessok == false))
1636 			return 0;
1637 		*nslots = BLIST_MAX_ALLOC;
1638 	}
1639 	/* XXXJAK: END HACK */
1640 
1641 	/*
1642 	 * lock data lock, convert slots into blocks, and enter loop
1643 	 */
1644 	mutex_enter(&uvm_swap_data_lock);
1645 
1646 ReTry:	/* XXXMRG */
1647 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
1648 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1649 			uint64_t result;
1650 
1651 			/* if it's not enabled, then we can't swap from it */
1652 			if ((sdp->swd_flags & SWF_ENABLE) == 0)
1653 				continue;
1654 			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1655 				continue;
1656 			result = blist_alloc(sdp->swd_blist, *nslots);
1657 			if (result == BLIST_NONE) {
1658 				continue;
1659 			}
1660 			KASSERT(result < sdp->swd_drumsize);
1661 
1662 			/*
1663 			 * successful allocation!  now rotate the tailq.
1664 			 */
1665 			TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1666 			TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1667 			sdp->swd_npginuse += *nslots;
1668 			uvmexp.swpginuse += *nslots;
1669 			mutex_exit(&uvm_swap_data_lock);
1670 			/* done!  return drum slot number */
1671 			UVMHIST_LOG(pdhist,
1672 			    "success!  returning %jd slots starting at %jd",
1673 			    *nslots, result + sdp->swd_drumoffset, 0, 0);
1674 			return (result + sdp->swd_drumoffset);
1675 		}
1676 	}
1677 
1678 	/* XXXMRG: BEGIN HACK */
1679 	if (*nslots > 1 && lessok) {
1680 		*nslots = 1;
1681 		/* XXXMRG: ugh!  blist should support this for us */
1682 		goto ReTry;
1683 	}
1684 	/* XXXMRG: END HACK */
1685 
1686 	mutex_exit(&uvm_swap_data_lock);
1687 	return 0;
1688 }
1689 
1690 /*
1691  * uvm_swapisfull: return true if most of available swap is allocated
1692  * and in use.  we don't count some small portion as it may be inaccessible
1693  * to us at any given moment, for example if there is lock contention or if
1694  * pages are busy.
1695  */
1696 bool
1697 uvm_swapisfull(void)
1698 {
1699 	int swpgonly;
1700 	bool rv;
1701 
1702 	if (uvmexp.swpages == 0) {
1703 		return true;
1704 	}
1705 
1706 	mutex_enter(&uvm_swap_data_lock);
1707 	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
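	/*
	 * scale the count of pages only in swap by 100 /
	 * uvm_swapisfull_factor: with the default factor of 99, swap
	 * is reported full once such pages reach 99% of the available
	 * swap pages.
	 */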
1708 	swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
1709 	    uvm_swapisfull_factor);
1710 	rv = (swpgonly >= uvmexp.swpgavail);
1711 	mutex_exit(&uvm_swap_data_lock);
1712 
1713 	return (rv);
1714 }
1715 
1716 /*
1717  * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1718  *
1719  * => we lock uvm_swap_data_lock
1720  */
1721 void
1722 uvm_swap_markbad(int startslot, int nslots)
1723 {
1724 	struct swapdev *sdp;
1725 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1726 
1727 	mutex_enter(&uvm_swap_data_lock);
1728 	sdp = swapdrum_getsdp(startslot);
1729 	KASSERT(sdp != NULL);
1730 
1731 	/*
1732 	 * we just keep track of how many pages have been marked bad
1733 	 * in this device, to make everything add up in swap_off().
1734 	 * we assume here that the range of slots will all be within
1735 	 * one swap device.
1736 	 */
1737 
1738 	KASSERT(uvmexp.swpgonly >= nslots);
1739 	atomic_add_int(&uvmexp.swpgonly, -nslots);
1740 	sdp->swd_npgbad += nslots;
1741 	UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0);
1742 	mutex_exit(&uvm_swap_data_lock);
1743 }
1744 
1745 /*
1746  * uvm_swap_free: free swap slots
1747  *
1748  * => this can be all or part of an allocation made by uvm_swap_alloc
1749  * => we lock uvm_swap_data_lock
1750  */
1751 void
1752 uvm_swap_free(int startslot, int nslots)
1753 {
1754 	struct swapdev *sdp;
1755 	UVMHIST_FUNC(__func__);
1756 	UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots,
1757 	    startslot, 0, 0);
1758 
1759 	/*
1760 	 * ignore attempts to free the "bad" slot.
1761 	 */
1762 
1763 	if (startslot == SWSLOT_BAD) {
1764 		return;
1765 	}
1766 
1767 	/*
1768 	 * convert drum slot offset back to sdp, free the blocks
1769 	 * in the blist, and return.   must hold uvm_swap_data_lock to
1770 	 * do the lookup and access the blist.
1771 	 */
1772 
1773 	mutex_enter(&uvm_swap_data_lock);
1774 	sdp = swapdrum_getsdp(startslot);
1775 	KASSERT(uvmexp.nswapdev >= 1);
1776 	KASSERT(sdp != NULL);
1777 	KASSERT(sdp->swd_npginuse >= nslots);
1778 	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
1779 	sdp->swd_npginuse -= nslots;
1780 	uvmexp.swpginuse -= nslots;
1781 	mutex_exit(&uvm_swap_data_lock);
1782 }
1783 
1784 /*
1785  * uvm_swap_put: put any number of pages into a contig place on swap
1786  *
1787  * => can be sync or async
1788  */
1789 
1790 int
1791 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
1792 {
1793 	int error;
1794 
1795 	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
1796 	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1797 	return error;
1798 }
1799 
1800 /*
1801  * uvm_swap_get: get a single page from swap
1802  *
1803  * => usually a sync op (from fault)
1804  */
1805 
1806 int
1807 uvm_swap_get(struct vm_page *page, int swslot, int flags)
1808 {
1809 	int error;
1810 
1811 	atomic_inc_uint(&uvmexp.nswget);
1812 	KASSERT(flags & PGO_SYNCIO);
1813 	if (swslot == SWSLOT_BAD) {
1814 		return EIO;
1815 	}
1816 
1817 	error = uvm_swap_io(&page, swslot, 1, B_READ |
1818 	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1819 	if (error == 0) {
1820 
1821 		/*
1822 		 * this page is no longer only in swap.
1823 		 */
1824 
1825 		KASSERT(uvmexp.swpgonly > 0);
1826 		atomic_dec_uint(&uvmexp.swpgonly);
1827 	}
1828 	return error;
1829 }
1830 
1831 /*
1832  * uvm_swap_io: do an i/o operation to swap
1833  */
1834 
1835 static int
1836 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
1837 {
1838 	daddr_t startblk;
1839 	struct	buf *bp;
1840 	vaddr_t kva;
1841 	int	error, mapinflags;
1842 	bool write, async, swap_encrypt;
1843 	UVMHIST_FUNC(__func__);
1844 	UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx",
1845 	    startslot, npages, flags, 0);
1846 
1847 	write = (flags & B_READ) == 0;
1848 	async = (flags & B_ASYNC) != 0;
1849 	swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt);
1850 
1851 	/*
1852 	 * allocate a buf for the i/o.
1853 	 */
1854 
1855 	KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
1856 	bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
1857 	if (bp == NULL) {
1858 		uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
1859 		return ENOMEM;
1860 	}
1861 
1862 	/*
1863 	 * convert starting drum slot to block number
1864 	 */
1865 
1866 	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
1867 
1868 	/*
1869 	 * first, map the pages into the kernel.
1870 	 */
1871 
1872 	mapinflags = !write ?
1873 		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
1874 		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
1875 	if (write && swap_encrypt)	/* need to encrypt in-place */
1876 		mapinflags |= UVMPAGER_MAPIN_READ;
1877 	kva = uvm_pagermapin(pps, npages, mapinflags);
1878 
1879 	/*
1880 	 * encrypt writes in place if requested
1881 	 */
1882 
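	/*
	 * (the "do { ... } while (0)" wrapper exists only so "break"
	 * can leave this step early; the decryption step below uses
	 * the same idiom.)
	 */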
1883 	if (write) do {
1884 		struct swapdev *sdp;
1885 		int i;
1886 
1887 		/*
1888 		 * Get the swapdev so we can discriminate on the
1889 		 * encryption state.  There may or may not be an
1890 		 * encryption key generated; we may or may not be asked
1891 		 * to encrypt swap.
1892 		 *
1893 		 * 1. NO KEY, NO ENCRYPTION: Nothing to do.
1894 		 *
1895 		 * 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt,
1896 		 *    and mark the slots encrypted.
1897 		 *
1898 		 * 3. KEY, BUT NO ENCRYPTION: The slots may already be
1899 		 *    marked encrypted from a past life.  Mark them not
1900 		 *    encrypted.
1901 		 *
1902 		 * 4. KEY, ENCRYPTION: Encrypt and mark the slots
1903 		 *    encrypted.
1904 		 */
1905 		mutex_enter(&uvm_swap_data_lock);
1906 		sdp = swapdrum_getsdp(startslot);
1907 		if (!sdp->swd_encinit) {
1908 			if (!swap_encrypt) {
1909 				mutex_exit(&uvm_swap_data_lock);
1910 				break;
1911 			}
1912 			uvm_swap_genkey(sdp);
1913 		}
1914 		KASSERT(sdp->swd_encinit);
1915 		mutex_exit(&uvm_swap_data_lock);
1916 
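		/*
		 * swd_encmap is a bitmap with one bit per slot of this
		 * device (32 bits per word): a slot's bit is set while
		 * its on-disk contents are encrypted, clear otherwise.
		 */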
1917 		for (i = 0; i < npages; i++) {
1918 			int s = startslot + i;
1919 			KDASSERT(swapdrum_sdp_is(s, sdp));
1920 			KASSERT(s >= sdp->swd_drumoffset);
1921 			s -= sdp->swd_drumoffset;
1922 			KASSERT(s < sdp->swd_drumsize);
1923 
1924 			if (swap_encrypt) {
1925 				uvm_swap_encryptpage(sdp,
1926 				    (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
1927 				atomic_or_32(&sdp->swd_encmap[s/32],
1928 				    __BIT(s%32));
1929 			} else {
1930 				atomic_and_32(&sdp->swd_encmap[s/32],
1931 				    ~__BIT(s%32));
1932 			}
1933 		}
1934 	} while (0);
1935 
1936 	/*
1937 	 * fill in the bp.   we currently route our i/o through
1938 	 * /dev/drum's vnode [swapdev_vp].
1939 	 */
1940 
1941 	bp->b_cflags = BC_BUSY | BC_NOCACHE;
1942 	bp->b_flags = (flags & (B_READ|B_ASYNC));
1943 	bp->b_proc = &proc0;	/* XXX */
1944 	bp->b_vnbufs.le_next = NOLIST;
1945 	bp->b_data = (void *)kva;
1946 	bp->b_blkno = startblk;
1947 	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
1948 
1949 	/*
1950 	 * bump v_numoutput (counter of number of active outputs).
1951 	 */
1952 
1953 	if (write) {
1954 		mutex_enter(swapdev_vp->v_interlock);
1955 		swapdev_vp->v_numoutput++;
1956 		mutex_exit(swapdev_vp->v_interlock);
1957 	}
1958 
1959 	/*
1960 	 * for async ops we must set up the iodone handler.
1961 	 */
1962 
1963 	if (async) {
1964 		bp->b_iodone = uvm_aio_aiodone;
1965 		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
1966 		if (curlwp == uvm.pagedaemon_lwp)
1967 			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1968 		else
1969 			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
1970 	} else {
1971 		bp->b_iodone = NULL;
1972 		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1973 	}
1974 	UVMHIST_LOG(pdhist,
1975 	    "about to start io: data = %#jx blkno = %#jx, bcount = %jd",
1976 	    (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);
1977 
1978 	/*
1979 	 * now we start the I/O, and if async, return.
1980 	 */
1981 
1982 	VOP_STRATEGY(swapdev_vp, bp);
1983 	if (async) {
1984 		/*
1985 		 * Reads are always synchronous; if this changes, we
1986 		 * need to add an asynchronous path for decryption.
1987 		 */
1988 		KASSERT(write);
1989 		return 0;
1990 	}
1991 
1992 	/*
1993 	 * must be sync i/o.   wait for it to finish
1994 	 */
1995 
1996 	error = biowait(bp);
1997 	if (error)
1998 		goto out;
1999 
2000 	/*
2001 	 * decrypt reads in place if needed
2002 	 */
2003 
2004 	if (!write) do {
2005 		struct swapdev *sdp;
2006 		bool encinit;
2007 		int i;
2008 
2009 		/*
2010 		 * Get the sdp.  Everything about it is stable until
2011 		 * all swap pages have been released and the device is
2012 		 * removed, except the encinit bit (whether the
2013 		 * encryption key has been initialized) and the
2014 		 * per-page encrypted bits.
2015 		 */
2016 		mutex_enter(&uvm_swap_data_lock);
2017 		sdp = swapdrum_getsdp(startslot);
2018 		encinit = sdp->swd_encinit;
2019 		mutex_exit(&uvm_swap_data_lock);
2020 
2021 		if (!encinit)
2022 			/*
2023 			 * If there's no encryption key, there's no way
2024 			 * any of these slots can be encrypted, so
2025 			 * nothing to do here.
2026 			 */
2027 			break;
2028 		for (i = 0; i < npages; i++) {
2029 			int s = startslot + i;
2030 			KDASSERT(swapdrum_sdp_is(s, sdp));
2031 			KASSERT(s >= sdp->swd_drumoffset);
2032 			s -= sdp->swd_drumoffset;
2033 			KASSERT(s < sdp->swd_drumsize);
2034 			if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) &
2035 				__BIT(s%32)) == 0)
2036 				continue;
2037 			uvm_swap_decryptpage(sdp,
2038 			    (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
2039 		}
2040 	} while (0);
2041 out:
2042 	/*
2043 	 * kill the pager mapping
2044 	 */
2045 
2046 	uvm_pagermapout(kva, npages);
2047 
2048 	/*
2049 	 * now dispose of the buf and we're done.
2050 	 */
2051 
2052 	if (write) {
2053 		mutex_enter(swapdev_vp->v_interlock);
2054 		vwakeup(bp);
2055 		mutex_exit(swapdev_vp->v_interlock);
2056 	}
2057 	putiobuf(bp);
2058 	UVMHIST_LOG(pdhist, "<- done (sync)  error=%jd", error, 0, 0, 0);
2059 
2060 	return error;
2061 }
2062 
2063 /*
2064  * uvm_swap_genkey(sdp)
2065  *
2066  *	Generate a key for swap encryption.
2067  */
2068 static void
2069 uvm_swap_genkey(struct swapdev *sdp)
2070 {
2071 	uint8_t key[32];
2072 
2073 	KASSERT(!sdp->swd_encinit);
2074 
2075 	cprng_strong(kern_cprng, key, sizeof key, 0);
2076 	aes_setenckey256(&sdp->swd_enckey, key);
2077 	aes_setdeckey256(&sdp->swd_deckey, key);
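	/* wipe the raw key off the stack; only the expanded schedules remain */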
2078 	explicit_memset(key, 0, sizeof key);
2079 
2080 	sdp->swd_encinit = true;
2081 }
2082 
2083 /*
2084  * uvm_swap_encryptpage(sdp, kva, slot)
2085  *
2086  *	Encrypt one page of data at kva for the specified slot number
2087  *	in the swap device.
2088  */
2089 static void
2090 uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot)
2091 {
2092 	uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
2093 
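	/*
	 * The IV is derived by encrypting the slot number, so each slot
	 * gets a distinct, key-dependent IV without any per-page IV
	 * storage.
	 */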
2094 	/* iv := AES_k(le32enc(slot) || 0^96) */
2095 	le32enc(preiv, slot);
2096 	aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
2097 
2098 	/* *kva := AES-CBC_k(iv, *kva) */
2099 	aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv,
2100 	    AES_256_NROUNDS);
2101 
2102 	explicit_memset(&iv, 0, sizeof iv);
2103 }
2104 
2105 /*
2106  * uvm_swap_decryptpage(sdp, kva, slot)
2107  *
2108  *	Decrypt one page of data at kva for the specified slot number
2109  *	in the swap device.
2110  */
2111 static void
2112 uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot)
2113 {
2114 	uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
2115 
2116 	/* iv := AES_k(le32enc(slot) || 0^96) */
2117 	le32enc(preiv, slot);
2118 	aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
2119 
2120 	/* *kva := AES-CBC^{-1}_k(iv, *kva) */
2121 	aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv,
2122 	    AES_256_NROUNDS);
2123 
2124 	explicit_memset(&iv, 0, sizeof iv);
2125 }
2126 
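/*
 * Expose the knob as vm.swap_encrypt so swap encryption can be
 * toggled at run time.
 */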
2127 SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup")
2128 {
2129 
2130 	sysctl_createv(clog, 0, NULL, NULL,
2131 	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt",
2132 	    SYSCTL_DESCR("Encrypt data when swapped out to disk"),
2133 	    NULL, 0, &uvm_swap_encrypt, 0,
2134 	    CTL_VM, CTL_CREATE, CTL_EOL);
2135 }
2136