xref: /dflybsd-src/sys/vm/swap_pager.c (revision 441d34b2441f59fde86fa4ef2d5d5cb7a6bfcb11)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 1998-2010 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * Copyright (c) 1994 John S. Dyson
37  * Copyright (c) 1990 University of Utah.
38  * Copyright (c) 1991, 1993
39  *	The Regents of the University of California.  All rights reserved.
40  *
41  * This code is derived from software contributed to Berkeley by
42  * the Systems Programming Group of the University of Utah Computer
43  * Science Department.
44  *
45  * Redistribution and use in source and binary forms, with or without
46  * modification, are permitted provided that the following conditions
47  * are met:
48  * 1. Redistributions of source code must retain the above copyright
49  *    notice, this list of conditions and the following disclaimer.
50  * 2. Redistributions in binary form must reproduce the above copyright
51  *    notice, this list of conditions and the following disclaimer in the
52  *    documentation and/or other materials provided with the distribution.
53  * 3. All advertising materials mentioning features or use of this software
54  *    must display the following acknowledgement:
55  *	This product includes software developed by the University of
56  *	California, Berkeley and its contributors.
57  * 4. Neither the name of the University nor the names of its contributors
58  *    may be used to endorse or promote products derived from this software
59  *    without specific prior written permission.
60  *
61  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
62  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
65  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
66  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
67  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
71  * SUCH DAMAGE.
72  *
73  *				New Swap System
74  *				Matthew Dillon
75  *
76  * Radix Bitmap 'blists'.
77  *
78  *	- The new swapper uses the new radix bitmap code.  This should scale
79  *	  to arbitrarily small or arbitrarily large swap spaces and an almost
80  *	  arbitrary degree of fragmentation.
81  *
82  * Features:
83  *
84  *	- on the fly reallocation of swap during putpages.  The new system
85  *	  does not try to keep previously allocated swap blocks for dirty
86  *	  pages.
87  *
88  *	- on the fly deallocation of swap
89  *
90  *	- No more garbage collection required.  Unnecessarily allocated swap
91  *	  blocks only exist for dirty vm_page_t's now and these are already
92  *	  cycled (in a high-load system) by the pager.  We also do on-the-fly
93  *	  removal of invalidated swap blocks when a page is destroyed
94  *	  or renamed.
95  *
96  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
97  * @(#)swap_pager.c	8.9 (Berkeley) 3/21/94
98  * $FreeBSD: src/sys/vm/swap_pager.c,v 1.130.2.12 2002/08/31 21:15:55 dillon Exp $
99  */
100 
101 #include <sys/param.h>
102 #include <sys/systm.h>
103 #include <sys/conf.h>
104 #include <sys/kernel.h>
105 #include <sys/proc.h>
106 #include <sys/buf.h>
107 #include <sys/vnode.h>
108 #include <sys/malloc.h>
109 #include <sys/vmmeter.h>
110 #include <sys/sysctl.h>
111 #include <sys/blist.h>
112 #include <sys/lock.h>
113 #include <sys/thread2.h>
114 
115 #ifndef MAX_PAGEOUT_CLUSTER
116 #define MAX_PAGEOUT_CLUSTER 16
117 #endif
118 
119 #define SWB_NPAGES	MAX_PAGEOUT_CLUSTER
120 
121 #include "opt_swap.h"
122 #include <vm/vm.h>
123 #include <vm/vm_object.h>
124 #include <vm/vm_page.h>
125 #include <vm/vm_pager.h>
126 #include <vm/vm_pageout.h>
127 #include <vm/swap_pager.h>
128 #include <vm/vm_extern.h>
129 #include <vm/vm_zone.h>
130 #include <vm/vnode_pager.h>
131 
132 #include <sys/buf2.h>
133 #include <vm/vm_page2.h>
134 
135 #define SWM_FREE	0x02	/* free, period			*/
136 #define SWM_POP		0x04	/* pop out			*/
137 
138 #define SWBIO_READ	0x01
139 #define SWBIO_WRITE	0x02
140 #define SWBIO_SYNC	0x04
141 
142 struct swfreeinfo {
143 	vm_object_t	object;
144 	vm_pindex_t	basei;
145 	vm_pindex_t	begi;
146 	vm_pindex_t	endi;	/* inclusive */
147 };
148 
149 /*
150  * vm_swap_size is in page-sized chunks now.  It was DEV_BSIZE'd chunks
151  * in the old system.
152  */
153 
154 int swap_pager_full;		/* swap space exhaustion (task killing) */
155 int vm_swap_cache_use;
156 int vm_swap_anon_use;
157 
158 static int swap_pager_almost_full; /* swap space exhaustion (w/ hysteresis)*/
159 static int nsw_rcount;		/* free read buffers			*/
160 static int nsw_wcount_sync;	/* limit write buffers / synchronous	*/
161 static int nsw_wcount_async;	/* limit write buffers / asynchronous	*/
162 static int nsw_wcount_async_max;/* assigned maximum			*/
163 static int nsw_cluster_max;	/* maximum VOP I/O allowed		*/
164 
165 struct blist *swapblist;
166 static int swap_async_max = 4;	/* maximum in-progress async I/O's	*/
167 static int swap_burst_read = 0;	/* allow burst reading */
168 
169 extern struct vnode *swapdev_vp;	/* from vm_swap.c */
170 
171 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
172         CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
173 SYSCTL_INT(_vm, OID_AUTO, swap_burst_read,
174         CTLFLAG_RW, &swap_burst_read, 0, "Allow burst reads for pageins");
175 
176 SYSCTL_INT(_vm, OID_AUTO, swap_cache_use,
177         CTLFLAG_RD, &vm_swap_cache_use, 0, "");
178 SYSCTL_INT(_vm, OID_AUTO, swap_anon_use,
179         CTLFLAG_RD, &vm_swap_anon_use, 0, "");
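
/*
 * Editorial usage note (not part of the original source): the knobs
 * above surface under the "vm" sysctl tree, so they would nominally
 * be tuned at runtime with e.g.
 *
 *	sysctl vm.swap_burst_read=1
 *	sysctl vm.swap_async_max=8
 *
 * while the read-only counters can be inspected with
 * "sysctl vm.swap_cache_use vm.swap_anon_use".
 */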
180 
181 vm_zone_t		swap_zone;
182 
183 /*
184  * Red-Black tree for swblock entries
185  *
186  * The caller must hold vm_token
187  */
188 RB_GENERATE2(swblock_rb_tree, swblock, swb_entry, rb_swblock_compare,
189 	     vm_pindex_t, swb_index);
190 
191 int
192 rb_swblock_compare(struct swblock *swb1, struct swblock *swb2)
193 {
194 	if (swb1->swb_index < swb2->swb_index)
195 		return(-1);
196 	if (swb1->swb_index > swb2->swb_index)
197 		return(1);
198 	return(0);
199 }
200 
201 static
202 int
203 rb_swblock_scancmp(struct swblock *swb, void *data)
204 {
205 	struct swfreeinfo *info = data;
206 
207 	if (swb->swb_index < info->basei)
208 		return(-1);
209 	if (swb->swb_index > info->endi)
210 		return(1);
211 	return(0);
212 }
213 
214 static
215 int
216 rb_swblock_condcmp(struct swblock *swb, void *data)
217 {
218 	struct swfreeinfo *info = data;
219 
220 	if (swb->swb_index < info->basei)
221 		return(-1);
222 	return(0);
223 }
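
/*
 * Editorial sketch (not part of the original source): given the
 * RB_GENERATE2() expansion above, a swblock covering an arbitrary
 * page index could nominally be located as below.  The helper name
 * is hypothetical; the masking to a SWAP_META_PAGES boundary mirrors
 * how the metadata is keyed by the comparator.
 */
#if 0
static struct swblock *
swp_pager_lookup_sketch(vm_object_t object, vm_pindex_t pindex)
{
	/* swblocks are keyed on SWAP_META_PAGES-aligned indices */
	pindex &= ~(vm_pindex_t)(SWAP_META_PAGES - 1);
	return (swblock_rb_tree_RB_LOOKUP(&object->swblock_root, pindex));
}
#endif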
224 
225 /*
226  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
227  * calls hooked from other parts of the VM system and do not appear here.
228  * (see vm/swap_pager.h).
229  */
230 
231 static void	swap_pager_dealloc (vm_object_t object);
232 static int	swap_pager_getpage (vm_object_t, vm_page_t *, int);
233 static void	swap_chain_iodone(struct bio *biox);
234 
235 struct pagerops swappagerops = {
236 	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object	*/
237 	swap_pager_getpage,	/* pagein				*/
238 	swap_pager_putpages,	/* pageout				*/
239 	swap_pager_haspage	/* get backing store status for page	*/
240 };
241 
242 /*
243  * dmmax is in page-sized chunks with the new swap system.  It was
244  * dev-bsized chunks in the old.  dmmax is always a power of 2.
245  *
246  * swap_*() routines are externally accessible.  swp_*() routines are
247  * internal.
248  */
249 
250 int dmmax;
251 static int dmmax_mask;
252 int nswap_lowat = 128;		/* in pages, swap_pager_almost_full warn */
253 int nswap_hiwat = 512;		/* in pages, swap_pager_almost_full warn */
254 
255 static __inline void	swp_sizecheck (void);
256 static void	swp_pager_async_iodone (struct bio *bio);
257 
258 /*
259  * Swap bitmap functions
260  */
261 
262 static __inline void	swp_pager_freeswapspace(vm_object_t object,
263 						daddr_t blk, int npages);
264 static __inline daddr_t	swp_pager_getswapspace(vm_object_t object, int npages);
265 
266 /*
267  * Metadata functions
268  */
269 
270 static void swp_pager_meta_convert(vm_object_t);
271 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
272 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
273 static void swp_pager_meta_free_all(vm_object_t);
274 static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
275 
276 /*
277  * SWP_SIZECHECK() -	update swap_pager_full indication
278  *
279  *	update the swap_pager_almost_full indication and warn when we are
280  *	about to run out of swap space, using lowat/hiwat hysteresis.
281  *
282  *	Clear swap_pager_full ( task killing ) indication when lowat is met.
283  *
284  * No restrictions on call
285  * This routine may not block.
286  * SMP races are ok.
287  */
288 static __inline void
289 swp_sizecheck(void)
290 {
291 	if (vm_swap_size < nswap_lowat) {
292 		if (swap_pager_almost_full == 0) {
293 			kprintf("swap_pager: out of swap space\n");
294 			swap_pager_almost_full = 1;
295 		}
296 	} else {
297 		swap_pager_full = 0;
298 		if (vm_swap_size > nswap_hiwat)
299 			swap_pager_almost_full = 0;
300 	}
301 }
302 
303 /*
304  * SWAP_PAGER_INIT() -	initialize the swap pager!
305  *
306  *	Expected to be started from system init.  NOTE:  This code is run
307  *	before much else so be careful what you depend on.  Most of the VM
308  *	system has yet to be initialized at this point.
309  *
310  * Called from the low level boot code only.
311  */
312 static void
313 swap_pager_init(void *arg __unused)
314 {
315 	/*
316 	 * Device Stripe, in PAGE_SIZE'd blocks
317 	 */
318 	dmmax = SWB_NPAGES * 2;
319 	dmmax_mask = ~(dmmax - 1);
320 }
321 SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_THIRD, swap_pager_init, NULL)
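
/*
 * Editorial worked example (not in the original source), assuming the
 * default MAX_PAGEOUT_CLUSTER of 16: dmmax = 32 pages and
 * dmmax_mask = ~31, so two swap block numbers a and b fall in the
 * same interleave stripe exactly when ((a ^ b) & dmmax_mask) == 0,
 * i.e. when a / 32 == b / 32.  This is the test applied by
 * swap_pager_strategy(), swap_pager_getpage() and
 * swap_pager_putpages() below.
 */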
322 
323 /*
324  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
325  *
326  *	Expected to be started from pageout process once, prior to entering
327  *	its main loop.
328  *
329  * Called from the low level boot code only.
330  */
331 void
332 swap_pager_swap_init(void)
333 {
334 	int n, n2;
335 
336 	/*
337 	 * Number of in-transit swap bp operations.  Don't
338 	 * exhaust the pbufs completely.  Make sure we
339 	 * initialize workable values (0 will work for hysteresis
340 	 * but it isn't very efficient).
341 	 *
342 	 * The nsw_cluster_max is constrained by the number of pages an XIO
343 	 * holds, i.e., (MAXPHYS/PAGE_SIZE) and our locally defined
344 	 * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
345 	 * constrained by the swap device interleave stripe size.
346 	 *
347 	 * Currently we hardwire nsw_wcount_async to 4.  This limit is
348 	 * designed to prevent other I/O from having high latencies due to
349 	 * our pageout I/O.  The value 4 works well for one or two active swap
350 	 * devices but is probably a little low if you have more.  Even so,
351 	 * a higher value would probably generate only a limited improvement
352 	 * with three or four active swap devices since the system does not
353 	 * typically have to pageout at extreme bandwidths.   We will want
354 	 * at least 2 per swap device, and 4 is a pretty good value if you
355 	 * have one NFS swap device due to the command/ack latency over NFS.
356 	 * So it all works out pretty well.
357 	 */
358 
359 	nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
360 
361 	nsw_rcount = (nswbuf + 1) / 2;
362 	nsw_wcount_sync = (nswbuf + 3) / 4;
363 	nsw_wcount_async = 4;
364 	nsw_wcount_async_max = nsw_wcount_async;
365 
366 	/*
367 	 * The zone is dynamically allocated so generally size it to
368 	 * maxswzone (32MB to 512MB of KVM).  Set a minimum size based
369 	 * on physical memory of around 8x (each swblock can hold 16 pages).
370 	 *
371 	 * With the advent of SSDs (vs HDs) the practical (swap:memory) ratio
372 	 * has increased dramatically.
373 	 */
374 	n = vmstats.v_page_count / 2;
375 	if (maxswzone && n < maxswzone / sizeof(struct swblock))
376 		n = maxswzone / sizeof(struct swblock);
377 	n2 = n;
378 
379 	do {
380 		swap_zone = zinit(
381 			"SWAPMETA",
382 			sizeof(struct swblock),
383 			n,
384 			ZONE_INTERRUPT,
385 			1);
386 		if (swap_zone != NULL)
387 			break;
388 		/*
389 		 * if the allocation failed, try a zone two thirds the
390 		 * size of the previous attempt.
391 		 */
392 		n -= ((n + 2) / 3);
393 	} while (n > 0);
394 
395 	if (swap_zone == NULL)
396 		panic("swap_pager_swap_init: swap_zone == NULL");
397 	if (n2 != n)
398 		kprintf("Swap zone entries reduced from %d to %d.\n", n2, n);
399 }
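
/*
 * Editorial worked example (not in the original source), assuming 4KB
 * pages, 16 pages per swblock, and no maxswzone override: with 1M
 * physical pages (4GB of RAM), n = v_page_count / 2 = 512K swblock
 * entries, each describing 16 swap pages, so the zone can track about
 * 8M swap pages (~32GB), i.e. roughly 8x physical memory as the
 * comment above states.
 */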
400 
401 /*
402  * SWAP_PAGER_ALLOC() -	allocate a new OBJT_SWAP VM object and instantiate
403  *			its metadata structures.
404  *
405  *	This routine is called from the mmap and fork code to create a new
406  *	OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
407  *	and then converting it with swp_pager_meta_convert().
408  *
409  *	We only support unnamed objects.
410  *
411  * No restrictions.
412  */
413 vm_object_t
414 swap_pager_alloc(void *handle, off_t size, vm_prot_t prot, off_t offset)
415 {
416 	vm_object_t object;
417 
418 	KKASSERT(handle == NULL);
419 	lwkt_gettoken(&vm_token);
420 	object = vm_object_allocate(OBJT_DEFAULT,
421 				    OFF_TO_IDX(offset + PAGE_MASK + size));
422 	swp_pager_meta_convert(object);
423 	lwkt_reltoken(&vm_token);
424 
425 	return (object);
426 }
427 
428 /*
429  * SWAP_PAGER_DEALLOC() -	remove swap metadata from object
430  *
431  *	The swap backing for the object is destroyed.  The code is
432  *	designed such that we can reinstantiate it later, but this
433  *	routine is typically called only when the entire object is
434  *	about to be destroyed.
435  *
436  * The object must be locked or unreferenceable.
437  * No other requirements.
438  */
439 static void
440 swap_pager_dealloc(vm_object_t object)
441 {
442 	lwkt_gettoken(&vm_token);
443 	vm_object_pip_wait(object, "swpdea");
444 
445 	/*
446 	 * Free all remaining metadata.  We only bother to free it from
447 	 * the swap meta data.  We do not attempt to free swapblk's still
448 	 * associated with vm_page_t's for this object.  We do not care
449 	 * if paging is still in progress on some objects.
450 	 */
451 	crit_enter();
452 	swp_pager_meta_free_all(object);
453 	crit_exit();
454 	lwkt_reltoken(&vm_token);
455 }
456 
457 /************************************************************************
458  *			SWAP PAGER BITMAP ROUTINES			*
459  ************************************************************************/
460 
461 /*
462  * SWP_PAGER_GETSWAPSPACE() -	allocate raw swap space
463  *
464  *	Allocate swap for the requested number of pages.  The starting
465  *	swap block number (a page index) is returned or SWAPBLK_NONE
466  *	if the allocation failed.
467  *
468  *	Also has the side effect of advising that somebody made a mistake
469  *	when they configured swap and didn't configure enough.
470  *
471  * The caller must hold vm_token.
472  * This routine may not block.
473  *
474  * NOTE: vm_token must be held to avoid races with bitmap frees from
475  *	 vm_page_remove() via swap_pager_page_removed().
476  */
477 static __inline daddr_t
478 swp_pager_getswapspace(vm_object_t object, int npages)
479 {
480 	daddr_t blk;
481 
482 	ASSERT_LWKT_TOKEN_HELD(&vm_token);
483 
484 	if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) {
485 		if (swap_pager_full != 2) {
486 			kprintf("swap_pager_getswapspace: failed\n");
487 			swap_pager_full = 2;
488 			swap_pager_almost_full = 1;
489 		}
490 	} else {
491 		vm_swap_size -= npages;
492 		if (object->type == OBJT_SWAP)
493 			vm_swap_anon_use += npages;
494 		else
495 			vm_swap_cache_use += npages;
496 		swp_sizecheck();
497 	}
498 	return(blk);
499 }
500 
501 /*
502  * SWP_PAGER_FREESWAPSPACE() -	free raw swap space
503  *
504  *	This routine returns the specified swap blocks back to the bitmap.
505  *
506  *	Note:  This routine may not block (it could in the old swap code),
507  *	and through the use of the new blist routines it does not block.
508  *
509  *	We must be called at splvm() to avoid races with bitmap frees from
510  *	vm_page_remove() aka swap_pager_page_removed().
511  *
512  * The caller must hold vm_token.
513  * This routine may not block.
514  */
515 
516 static __inline void
517 swp_pager_freeswapspace(vm_object_t object, daddr_t blk, int npages)
518 {
519 	blist_free(swapblist, blk, npages);
520 	vm_swap_size += npages;
521 	if (object->type == OBJT_SWAP)
522 		vm_swap_anon_use -= npages;
523 	else
524 		vm_swap_cache_use -= npages;
525 	swp_sizecheck();
526 }
527 
528 /*
529  * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
530  *				range within an object.
531  *
532  *	This is a globally accessible routine.
533  *
534  *	This routine removes swapblk assignments from swap metadata.
535  *
536  *	The external callers of this routine typically have already destroyed
537  *	or renamed vm_page_t's associated with this range in the object so
538  *	we should be ok.
539  *
540  * No requirements.
541  */
542 void
543 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
544 {
545 	crit_enter();
546 	lwkt_gettoken(&vm_token);
547 	swp_pager_meta_free(object, start, size);
548 	lwkt_reltoken(&vm_token);
549 	crit_exit();
550 }
551 
552 /*
553  * No requirements.
554  */
555 void
556 swap_pager_freespace_all(vm_object_t object)
557 {
558 	crit_enter();
559 	lwkt_gettoken(&vm_token);
560 	swp_pager_meta_free_all(object);
561 	lwkt_reltoken(&vm_token);
562 	crit_exit();
563 }
564 
565 /*
566  * This function conditionally frees swap cache swap starting at
567  * (*basei) in the object.  (count) swap blocks will be nominally freed.
568  * The actual number of blocks freed can be more or less than the
569  * requested number.
570  *
571  * This function nominally returns the number of blocks freed.  However,
572  * the actual number of blocks freed may be less than the returned value.
573  * If the function is unable to exhaust the object or if it is able to
574  * free (approximately) the requested number of blocks it returns
575  * a value n > count.
576  *
577  * If we exhaust the object we will return a value n <= count.
578  *
579  * The caller must hold vm_token.
580  */
581 static int swap_pager_condfree_callback(struct swblock *swap, void *data);
582 
583 int
584 swap_pager_condfree(vm_object_t object, vm_pindex_t *basei, int count)
585 {
586 	struct swfreeinfo info;
587 
588 	ASSERT_LWKT_TOKEN_HELD(&vm_token);
589 
590 	info.object = object;
591 	info.basei = *basei;	/* skip up to this page index */
592 	info.begi = count;	/* max swap pages to destroy */
593 	info.endi = count * 8;	/* max swblocks to scan */
594 
595 	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_condcmp,
596 				swap_pager_condfree_callback, &info);
597 	*basei = info.basei;
598 	if (info.endi < 0 && info.begi <= count)
599 		info.begi = count + 1;
600 	return(count - (int)info.begi);
601 }
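
/*
 * Editorial caller sketch (not part of the original source, names
 * hypothetical): per the contract documented above, a cleaning loop
 * would nominally keep calling swap_pager_condfree() with a
 * persistent index until the return value drops to <= count, which
 * the comment above defines as object exhaustion.
 */
#if 0
static void
swap_condfree_sweep_sketch(vm_object_t object, int count)
{
	vm_pindex_t basei = 0;

	lwkt_gettoken(&vm_token);
	while (swap_pager_condfree(object, &basei, count) > count)
		;	/* more swblocks may remain past basei */
	lwkt_reltoken(&vm_token);
}
#endif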
602 
603 /*
604  * The idea is to free whole meta-blocks to avoid fragmenting
605  * the swap space or disk I/O.  We only do this if NO VM pages
606  * are present.
607  *
608  * We do not have to deal with clearing PG_SWAPPED in related VM
609  * pages because there are no related VM pages.
610  *
611  * The caller must hold vm_token.
612  */
613 static int
614 swap_pager_condfree_callback(struct swblock *swap, void *data)
615 {
616 	struct swfreeinfo *info = data;
617 	vm_object_t object = info->object;
618 	int i;
619 
620 	for (i = 0; i < SWAP_META_PAGES; ++i) {
621 		if (vm_page_lookup(object, swap->swb_index + i))
622 			break;
623 	}
624 	info->basei = swap->swb_index + SWAP_META_PAGES;
625 	if (i == SWAP_META_PAGES) {
626 		info->begi -= swap->swb_count;
627 		swap_pager_freespace(object, swap->swb_index, SWAP_META_PAGES);
628 	}
629 	--info->endi;
630 	if ((int)info->begi < 0 || (int)info->endi < 0)
631 		return(-1);
632 	return(0);
633 }
634 
635 /*
636  * Called by vm_page_alloc() when a new VM page is inserted
637  * into a VM object.  Checks whether swap has been assigned to
638  * the page and sets PG_SWAPPED as necessary.
639  *
640  * No requirements.
641  */
642 void
643 swap_pager_page_inserted(vm_page_t m)
644 {
645 	if (m->object->swblock_count) {
646 		crit_enter();
647 		lwkt_gettoken(&vm_token);
648 		if (swp_pager_meta_ctl(m->object, m->pindex, 0) != SWAPBLK_NONE)
649 			vm_page_flag_set(m, PG_SWAPPED);
650 		lwkt_reltoken(&vm_token);
651 		crit_exit();
652 	}
653 }
654 
655 /*
656  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
657  *
658  *	Assigns swap blocks to the specified range within the object.  The
659  *	swap blocks are not zeroed.  Any previous swap assignment is destroyed.
660  *
661  *	Returns 0 on success, -1 on failure.
662  *
663  * The caller is responsible for avoiding races in the specified range.
664  * No other requirements.
665  */
666 int
667 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
668 {
669 	int n = 0;
670 	daddr_t blk = SWAPBLK_NONE;
671 	vm_pindex_t beg = start;	/* save start index */
672 
673 	crit_enter();
674 	lwkt_gettoken(&vm_token);
675 	while (size) {
676 		if (n == 0) {
677 			n = BLIST_MAX_ALLOC;
678 			while ((blk = swp_pager_getswapspace(object, n)) ==
679 			       SWAPBLK_NONE)
680 			{
681 				n >>= 1;
682 				if (n == 0) {
683 					swp_pager_meta_free(object, beg,
684 							    start - beg);
685 					lwkt_reltoken(&vm_token);
686 					crit_exit();
687 					return(-1);
688 				}
689 			}
690 		}
691 		swp_pager_meta_build(object, start, blk);
692 		--size;
693 		++start;
694 		++blk;
695 		--n;
696 	}
697 	swp_pager_meta_free(object, start, n);
698 	lwkt_reltoken(&vm_token);
699 	crit_exit();
700 	return(0);
701 }
702 
703 /*
704  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
705  *			and destroy the source.
706  *
707  *	Copy any valid swapblks from the source to the destination.  In
708  *	cases where both the source and destination have a valid swapblk,
709  *	we keep the destination's.
710  *
711  *	This routine is allowed to block.  It may block allocating metadata
712  *	indirectly through swp_pager_meta_build() or if paging is still in
713  *	progress on the source.
714  *
715  *	This routine can be called at any spl
716  *
717  *	XXX vm_page_collapse() kinda expects us not to block because we
718  *	supposedly do not need to allocate memory, but for the moment we
719  *	*may* have to get a little memory from the zone allocator, but
720  *	it is taken from the interrupt memory.  We should be ok.
721  *
722  *	The source object contains no vm_page_t's (which is just as well)
723  *
724  *	The source object is of type OBJT_SWAP.
725  *
726  *	The source and destination objects must be locked or
727  *	inaccessible (XXX are they ?)
728  *
729  * The caller must hold vm_token.
730  */
731 void
732 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
733 		vm_pindex_t base_index, int destroysource)
734 {
735 	vm_pindex_t i;
736 
737 	ASSERT_LWKT_TOKEN_HELD(&vm_token);
738 	crit_enter();
739 
740 	/*
741 	 * transfer source to destination.
742 	 */
743 	for (i = 0; i < dstobject->size; ++i) {
744 		daddr_t dstaddr;
745 
746 		/*
747 		 * Locate (without changing) the swapblk on the destination,
748 		 * unless it is invalid in which case free it silently, or
749 		 * if the destination is a resident page, in which case the
750 		 * source is thrown away.
751 		 */
752 		dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
753 
754 		if (dstaddr == SWAPBLK_NONE) {
755 			/*
756 			 * Destination has no swapblk and is not resident,
757 			 * copy source.
758 			 */
759 			daddr_t srcaddr;
760 
761 			srcaddr = swp_pager_meta_ctl(srcobject,
762 						     base_index + i, SWM_POP);
763 
764 			if (srcaddr != SWAPBLK_NONE)
765 				swp_pager_meta_build(dstobject, i, srcaddr);
766 		} else {
767 			/*
768 			 * Destination has valid swapblk or it is represented
769 			 * by a resident page.  We destroy the sourceblock.
770 			 */
771 			swp_pager_meta_ctl(srcobject, base_index + i, SWM_FREE);
772 		}
773 	}
774 
775 	/*
776 	 * Free left over swap blocks in source.
777 	 *
778 	 * We have to revert the type to OBJT_DEFAULT so we do not accidentally
779 	 * double-remove the object from the swap queues.
780 	 */
781 	if (destroysource) {
782 		/*
783 		 * Reverting the type is not necessary, the caller is going
784 		 * to destroy srcobject directly, but I'm doing it here
785 		 * for consistency since we've removed the object from its
786 		 * queues.
787 		 */
788 		swp_pager_meta_free_all(srcobject);
789 		if (srcobject->type == OBJT_SWAP)
790 			srcobject->type = OBJT_DEFAULT;
791 	}
792 	crit_exit();
793 }
794 
795 /*
796  * SWAP_PAGER_HASPAGE() -	determine if we have good backing store for
797  *				the requested page.
798  *
799  *	We determine whether good backing store exists for the requested
800  *	page and return TRUE if it does, FALSE if it doesn't.
801  *
802  *	If TRUE, we also try to determine how much valid, contiguous backing
803  *	store exists before and after the requested page within a reasonable
804  *	distance.  We do not try to restrict it to the swap device stripe
805  *	(that is handled in getpages/putpages).  It probably isn't worth
806  *	doing here.
807  *
808  * No requirements.
809  */
810 boolean_t
811 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex)
812 {
813 	daddr_t blk0;
814 
815 	/*
816 	 * do we have good backing store at the requested index ?
817 	 */
818 
819 	crit_enter();
820 	lwkt_gettoken(&vm_token);
821 	blk0 = swp_pager_meta_ctl(object, pindex, 0);
822 
823 	if (blk0 == SWAPBLK_NONE) {
824 		lwkt_reltoken(&vm_token);
825 		crit_exit();
826 		return (FALSE);
827 	}
828 	lwkt_reltoken(&vm_token);
829 	crit_exit();
830 	return (TRUE);
831 }
832 
833 /*
834  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
835  *
836  * This removes any associated swap backing store, whether valid or
837  * not, from the page.  This operates on any VM object, not just OBJT_SWAP
838  * objects.
839  *
840  * This routine is typically called when a page is made dirty, at
841  * which point any associated swap can be freed.  MADV_FREE also
842  * calls us in a special-case situation.
843  *
844  * NOTE!!!  If the page is clean and the swap was valid, the caller
845  * should make the page dirty before calling this routine.  This routine
846  * does NOT change the m->dirty status of the page.  Also: MADV_FREE
847  * depends on it.
848  *
849  * The page must be busied or soft-busied.
850  * The caller must hold vm_token if the caller does not wish to block here.
851  * No other requirements.
852  */
853 void
854 swap_pager_unswapped(vm_page_t m)
855 {
856 	if (m->flags & PG_SWAPPED) {
857 		crit_enter();
858 		lwkt_gettoken(&vm_token);
859 		KKASSERT(m->flags & PG_SWAPPED);
860 		swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
861 		vm_page_flag_clear(m, PG_SWAPPED);
862 		lwkt_reltoken(&vm_token);
863 		crit_exit();
864 	}
865 }
866 
867 /*
868  * SWAP_PAGER_STRATEGY() - read, write, free blocks
869  *
870  * This implements a VM OBJECT strategy function using swap backing store.
871  * This can operate on any VM OBJECT type, not necessarily just OBJT_SWAP
872  * types.
873  *
874  * This is intended to be a cacheless interface (i.e. caching occurs at
875  * higher levels), and is also used as a swap-based SSD cache for vnode
876  * and device objects.
877  *
878  * All I/O goes directly to and from the swap device.
879  *
880  * We currently attempt to run I/O synchronously or asynchronously as
881  * the caller requests.  This isn't perfect because we lose error
882  * sequencing when we run multiple ops in parallel to satisfy a request.
883  * But this is swap, so we let it all hang out.
884  *
885  * No requirements.
886  */
887 void
888 swap_pager_strategy(vm_object_t object, struct bio *bio)
889 {
890 	struct buf *bp = bio->bio_buf;
891 	struct bio *nbio;
892 	vm_pindex_t start;
893 	vm_pindex_t biox_blkno = 0;
894 	int count;
895 	char *data;
896 	struct bio *biox;
897 	struct buf *bufx;
898 	struct bio_track *track;
899 
900 	/*
901 	 * tracking for swapdev vnode I/Os
902 	 */
903 	if (bp->b_cmd == BUF_CMD_READ)
904 		track = &swapdev_vp->v_track_read;
905 	else
906 		track = &swapdev_vp->v_track_write;
907 
908 	if (bp->b_bcount & PAGE_MASK) {
909 		bp->b_error = EINVAL;
910 		bp->b_flags |= B_ERROR | B_INVAL;
911 		biodone(bio);
912 		kprintf("swap_pager_strategy: bp %p offset %lld size %d, "
913 			"not page bounded\n",
914 			bp, (long long)bio->bio_offset, (int)bp->b_bcount);
915 		return;
916 	}
917 
918 	/*
919 	 * Clear error indication, initialize page index, count, data pointer.
920 	 */
921 	bp->b_error = 0;
922 	bp->b_flags &= ~B_ERROR;
923 	bp->b_resid = bp->b_bcount;
924 
925 	start = (vm_pindex_t)(bio->bio_offset >> PAGE_SHIFT);
926 	count = howmany(bp->b_bcount, PAGE_SIZE);
927 	data = bp->b_data;
928 
929 	/*
930 	 * Deal with BUF_CMD_FREEBLKS
931 	 */
932 	if (bp->b_cmd == BUF_CMD_FREEBLKS) {
933 		/*
934 		 * FREE PAGE(s) - destroy underlying swap that is no longer
935 		 *		  needed.
936 		 */
937 		crit_enter();
938 		lwkt_gettoken(&vm_token);
939 		swp_pager_meta_free(object, start, count);
940 		lwkt_reltoken(&vm_token);
941 		crit_exit();
942 		bp->b_resid = 0;
943 		biodone(bio);
944 		return;
945 	}
946 
947 	/*
948 	 * We need to be able to create a new cluster of I/O's.  We cannot
949 	 * use the caller fields of the passed bio so push a new one.
950 	 *
951 	 * Because nbio is just a placeholder for the cluster links,
952 	 * we can biodone() the original bio instead of nbio to make
953 	 * things a bit more efficient.
954 	 */
955 	nbio = push_bio(bio);
956 	nbio->bio_offset = bio->bio_offset;
957 	nbio->bio_caller_info1.cluster_head = NULL;
958 	nbio->bio_caller_info2.cluster_tail = NULL;
959 
960 	biox = NULL;
961 	bufx = NULL;
962 
963 	/*
964 	 * Execute read or write
965 	 */
966 	crit_enter();
967 	lwkt_gettoken(&vm_token);
968 	while (count > 0) {
969 		daddr_t blk;
970 
971 		/*
972 		 * Obtain block.  If block not found and writing, allocate a
973 		 * new block and build it into the object.
974 		 */
975 		blk = swp_pager_meta_ctl(object, start, 0);
976 		if ((blk == SWAPBLK_NONE) && bp->b_cmd != BUF_CMD_READ) {
977 			blk = swp_pager_getswapspace(object, 1);
978 			if (blk == SWAPBLK_NONE) {
979 				bp->b_error = ENOMEM;
980 				bp->b_flags |= B_ERROR;
981 				break;
982 			}
983 			swp_pager_meta_build(object, start, blk);
984 		}
985 
986 		/*
987 		 * Do we have to flush our current collection?  Yes if:
988 		 *
989 		 *	- no swap block at this index
990 		 *	- swap block is not contiguous
991 		 *	- we cross a physical disk boundary in the
992 		 *	  stripe.
993 		 */
994 		if (
995 		    biox && (biox_blkno + btoc(bufx->b_bcount) != blk ||
996 		     ((biox_blkno ^ blk) & dmmax_mask)
997 		    )
998 		) {
999 			if (bp->b_cmd == BUF_CMD_READ) {
1000 				++mycpu->gd_cnt.v_swapin;
1001 				mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
1002 			} else {
1003 				++mycpu->gd_cnt.v_swapout;
1004 				mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
1005 				bufx->b_dirtyend = bufx->b_bcount;
1006 			}
1007 
1008 			/*
1009 			 * Finished with this buf.
1010 			 */
1011 			KKASSERT(bufx->b_bcount != 0);
1012 			if (bufx->b_cmd != BUF_CMD_READ)
1013 				bufx->b_dirtyend = bufx->b_bcount;
1014 			biox = NULL;
1015 			bufx = NULL;
1016 		}
1017 
1018 		/*
1019 		 * Add new swapblk to biox, instantiating biox if necessary.
1020 		 * Zero-fill reads are able to take a shortcut.
1021 		 */
1022 		if (blk == SWAPBLK_NONE) {
1023 			/*
1024 			 * We can only get here if we are reading.  Since
1025 			 * we are at splvm() we can safely modify b_resid,
1026 			 * even if chain ops are in progress.
1027 			 */
1028 			bzero(data, PAGE_SIZE);
1029 			bp->b_resid -= PAGE_SIZE;
1030 		} else {
1031 			if (biox == NULL) {
1032 				/* XXX chain count > 4, wait to <= 4 */
1033 
1034 				bufx = getpbuf(NULL);
1035 				biox = &bufx->b_bio1;
1036 				cluster_append(nbio, bufx);
1037 				bufx->b_flags |= (bufx->b_flags & B_ORDERED);
1038 				bufx->b_cmd = bp->b_cmd;
1039 				biox->bio_done = swap_chain_iodone;
1040 				biox->bio_offset = (off_t)blk << PAGE_SHIFT;
1041 				biox->bio_caller_info1.cluster_parent = nbio;
1042 				biox_blkno = blk;
1043 				bufx->b_bcount = 0;
1044 				bufx->b_data = data;
1045 			}
1046 			bufx->b_bcount += PAGE_SIZE;
1047 		}
1048 		--count;
1049 		++start;
1050 		data += PAGE_SIZE;
1051 	}
1052 	lwkt_reltoken(&vm_token);
1053 	crit_exit();
1054 
1055 	/*
1056 	 *  Flush out last buffer
1057 	 */
1058 	if (biox) {
1059 		if (bufx->b_cmd == BUF_CMD_READ) {
1060 			++mycpu->gd_cnt.v_swapin;
1061 			mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
1062 		} else {
1063 			++mycpu->gd_cnt.v_swapout;
1064 			mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
1065 			bufx->b_dirtyend = bufx->b_bcount;
1066 		}
1067 		KKASSERT(bufx->b_bcount);
1068 		if (bufx->b_cmd != BUF_CMD_READ)
1069 			bufx->b_dirtyend = bufx->b_bcount;
1070 		/* biox, bufx = NULL */
1071 	}
1072 
1073 	/*
1074 	 * Now initiate all the I/O.  Be careful looping on our chain as
1075 	 * I/O's may complete while we are still initiating them.
1076 	 *
1077 	 * If the request is a 100% sparse read no bios will be present
1078 	 * and we just biodone() the buffer.
1079 	 */
1080 	nbio->bio_caller_info2.cluster_tail = NULL;
1081 	bufx = nbio->bio_caller_info1.cluster_head;
1082 
1083 	if (bufx) {
1084 		while (bufx) {
1085 			biox = &bufx->b_bio1;
1086 			BUF_KERNPROC(bufx);
1087 			bufx = bufx->b_cluster_next;
1088 			vn_strategy(swapdev_vp, biox);
1089 		}
1090 	} else {
1091 		biodone(bio);
1092 	}
1093 
1094 	/*
1095 	 * Completion of the cluster will also call biodone_chain(nbio).
1096 	 * We never call biodone(nbio) so we don't have to worry about
1097 	 * setting up a bio_done callback.  It's handled in the sub-IO.
1098 	 */
1099 	/**/
1100 }
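
/*
 * Editorial sketch of the clustering above (not in the original
 * source):
 *
 *	bio   - caller's bio, finished via biodone()/biodone_chain()
 *	 nbio - push_bio() placeholder holding cluster_head/cluster_tail
 *	  bufx/biox #1 -> vn_strategy(swapdev_vp, biox)
 *	  bufx/biox #2 -> ...
 *
 * Each sub-bio completes through swap_chain_iodone(), which unlinks
 * its bufx from nbio's cluster list, releases it with relpbuf(), and
 * calls biodone_chain(nbio) once the list is empty, completing the
 * original request.
 */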
1101 
1102 /*
1103  * biodone callback
1104  *
1105  * No requirements.
1106  */
1107 static void
1108 swap_chain_iodone(struct bio *biox)
1109 {
1110 	struct buf **nextp;
1111 	struct buf *bufx;	/* chained sub-buffer */
1112 	struct bio *nbio;	/* parent nbio with chain glue */
1113 	struct buf *bp;		/* original bp associated with nbio */
1114 	int chain_empty;
1115 
1116 	bufx = biox->bio_buf;
1117 	nbio = biox->bio_caller_info1.cluster_parent;
1118 	bp = nbio->bio_buf;
1119 
1120 	/*
1121 	 * Update the original buffer
1122 	 */
1123         KKASSERT(bp != NULL);
1124 	if (bufx->b_flags & B_ERROR) {
1125 		atomic_set_int(&bufx->b_flags, B_ERROR);
1126 		bp->b_error = bufx->b_error;
1127 	} else if (bufx->b_resid != 0) {
1128 		atomic_set_int(&bufx->b_flags, B_ERROR);
1129 		bp->b_error = EINVAL;
1130 	} else {
1131 		atomic_subtract_int(&bp->b_resid, bufx->b_bcount);
1132 	}
1133 
1134 	/*
1135 	 * Remove us from the chain.
1136 	 */
1137 	spin_lock_wr(&bp->b_lock.lk_spinlock);
1138 	nextp = &nbio->bio_caller_info1.cluster_head;
1139 	while (*nextp != bufx) {
1140 		KKASSERT(*nextp != NULL);
1141 		nextp = &(*nextp)->b_cluster_next;
1142 	}
1143 	*nextp = bufx->b_cluster_next;
1144 	chain_empty = (nbio->bio_caller_info1.cluster_head == NULL);
1145 	spin_unlock_wr(&bp->b_lock.lk_spinlock);
1146 
1147 	/*
1148 	 * Clean up bufx.  If the chain is now empty we finish out
1149 	 * the parent.  Note that we may be racing other completions
1150 	 * so we must use the chain_empty status from above.
1151 	 */
1152 	if (chain_empty) {
1153 		if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
1154 			atomic_set_int(&bp->b_flags, B_ERROR);
1155 			bp->b_error = EINVAL;
1156 		}
1157 		biodone_chain(nbio);
1158         }
1159         relpbuf(bufx, NULL);
1160 }
1161 
1162 /*
1163  * SWAP_PAGER_GETPAGES() - bring page in from swap
1164  *
1165  * The requested page may have to be brought in from swap.  Calculate the
1166  * swap block and bring in additional pages if possible.  All pages must
1167  * have contiguous swap block assignments and reside in the same object.
1168  *
1169  * The caller has a single vm_object_pip_add() reference prior to
1170  * calling us and we should return with the same.
1171  *
1172  * The caller has BUSY'd the page.  We should return with (*mpp) left busy,
1173  * and any additional pages unbusied.
1174  *
1175  * If the caller encounters a PG_RAM page it will pass it to us even though
1176  * it may be valid and dirty.  We cannot overwrite the page in this case!
1177  * This case is used to allow us to issue pure read-aheads.
1178  *
1179  * NOTE! XXX This code does not entirely pipeline yet due to the fact that
1180  *       the PG_RAM page is validated at the same time as mreq.  What we
1181  *	 really need to do is issue a separate read-ahead pbuf.
1182  *
1183  * No requirements.
1184  */
1185 static int
1186 swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
1187 {
1188 	struct buf *bp;
1189 	struct bio *bio;
1190 	vm_page_t mreq;
1191 	vm_page_t m;
1192 	vm_offset_t kva;
1193 	daddr_t blk;
1194 	int i;
1195 	int j;
1196 	int raonly;
1197 	vm_page_t marray[XIO_INTERNAL_PAGES];
1198 
1199 	mreq = *mpp;
1200 
1201 	if (mreq->object != object) {
1202 		panic("swap_pager_getpages: object mismatch %p/%p",
1203 		    object,
1204 		    mreq->object
1205 		);
1206 	}
1207 
1208 	/*
1209 	 * We don't want to overwrite a fully valid page as it might be
1210 	 * dirty.  This case can occur when e.g. vm_fault hits a perfectly
1211 	 * valid page with PG_RAM set.
1212 	 *
1213 	 * In this case we see if the next page is a suitable page-in
1214 	 * candidate and if it is we issue read-ahead.  PG_RAM will be
1215 	 * set on the last page of the read-ahead to continue the pipeline.
1216 	 */
1217 	if (mreq->valid == VM_PAGE_BITS_ALL) {
1218 		if (swap_burst_read == 0 || mreq->pindex + 1 >= object->size)
1219 			return(VM_PAGER_OK);
1220 		crit_enter();
1221 		lwkt_gettoken(&vm_token);
1222 		blk = swp_pager_meta_ctl(object, mreq->pindex + 1, 0);
1223 		if (blk == SWAPBLK_NONE) {
1224 			lwkt_reltoken(&vm_token);
1225 			crit_exit();
1226 			return(VM_PAGER_OK);
1227 		}
1228 		m = vm_page_lookup(object, mreq->pindex + 1);
1229 		if (m == NULL) {
1230 			m = vm_page_alloc(object, mreq->pindex + 1,
1231 					  VM_ALLOC_QUICK);
1232 			if (m == NULL) {
1233 				lwkt_reltoken(&vm_token);
1234 				crit_exit();
1235 				return(VM_PAGER_OK);
1236 			}
1237 		} else {
1238 			if ((m->flags & PG_BUSY) || m->busy || m->valid) {
1239 				lwkt_reltoken(&vm_token);
1240 				crit_exit();
1241 				return(VM_PAGER_OK);
1242 			}
1243 			vm_page_unqueue_nowakeup(m);
1244 			vm_page_busy(m);
1245 		}
1246 		mreq = m;
1247 		raonly = 1;
1248 		lwkt_reltoken(&vm_token);
1249 		crit_exit();
1250 	} else {
1251 		raonly = 0;
1252 	}
1253 
1254 	/*
1255 	 * Try to block-read contiguous pages from swap if sequential,
1256 	 * otherwise just read one page.  Contiguous pages from swap must
1257 	 * reside within a single device stripe because the I/O cannot be
1258 	 * broken up across multiple stripes.
1259 	 *
1260 	 * Note that blk and iblk can be SWAPBLK_NONE but the loop is
1261 	 * set up such that the case(s) are handled implicitly.
1262 	 */
1263 	crit_enter();
1264 	lwkt_gettoken(&vm_token);
1265 	blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
1266 	marray[0] = mreq;
1267 
1268 	for (i = 1; swap_burst_read &&
1269 		    i < XIO_INTERNAL_PAGES &&
1270 		    mreq->pindex + i < object->size; ++i) {
1271 		daddr_t iblk;
1272 
1273 		iblk = swp_pager_meta_ctl(object, mreq->pindex + i, 0);
1274 		if (iblk != blk + i)
1275 			break;
1276 		if ((blk ^ iblk) & dmmax_mask)
1277 			break;
1278 		m = vm_page_lookup(object, mreq->pindex + i);
1279 		if (m == NULL) {
1280 			m = vm_page_alloc(object, mreq->pindex + i,
1281 					  VM_ALLOC_QUICK);
1282 			if (m == NULL)
1283 				break;
1284 		} else {
1285 			if ((m->flags & PG_BUSY) || m->busy || m->valid)
1286 				break;
1287 			vm_page_unqueue_nowakeup(m);
1288 			vm_page_busy(m);
1289 		}
1290 		marray[i] = m;
1291 	}
1292 	if (i > 1)
1293 		vm_page_flag_set(marray[i - 1], PG_RAM);
1294 
1295 	lwkt_reltoken(&vm_token);
1296 	crit_exit();
1297 
1298 	/*
1299 	 * If mreq is the requested page and we have nothing to do return
1300 	 * VM_PAGER_FAIL.  If raonly is set mreq is just another read-ahead
1301 	 * page and must be cleaned up.
1302 	 */
1303 	if (blk == SWAPBLK_NONE) {
1304 		KKASSERT(i == 1);
1305 		if (raonly) {
1306 			vnode_pager_freepage(mreq);
1307 			return(VM_PAGER_OK);
1308 		} else {
1309 			return(VM_PAGER_FAIL);
1310 		}
1311 	}
1312 
1313 	/*
1314 	 * map our page(s) into kva for input
1315 	 */
1316 	bp = getpbuf(&nsw_rcount);
1317 	bio = &bp->b_bio1;
1318 	kva = (vm_offset_t) bp->b_kvabase;
1319 	bcopy(marray, bp->b_xio.xio_pages, i * sizeof(vm_page_t));
1320 	pmap_qenter(kva, bp->b_xio.xio_pages, i);
1321 
1322 	bp->b_data = (caddr_t)kva;
1323 	bp->b_bcount = PAGE_SIZE * i;
1324 	bp->b_xio.xio_npages = i;
1325 	bio->bio_done = swp_pager_async_iodone;
1326 	bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1327 	bio->bio_caller_info1.index = SWBIO_READ;
1328 
1329 	/*
1330 	 * Set index.  If raonly set the index beyond the array so all
1331 	 * the pages are treated the same, otherwise the original mreq is
1332 	 * at index 0.
1333 	 */
1334 	if (raonly)
1335 		bio->bio_driver_info = (void *)(intptr_t)i;
1336 	else
1337 		bio->bio_driver_info = (void *)(intptr_t)0;
1338 
1339 	for (j = 0; j < i; ++j)
1340 		vm_page_flag_set(bp->b_xio.xio_pages[j], PG_SWAPINPROG);
1341 
1342 	mycpu->gd_cnt.v_swapin++;
1343 	mycpu->gd_cnt.v_swappgsin += bp->b_xio.xio_npages;
1344 
1345 	/*
1346 	 * We still hold the lock on mreq, and our automatic completion routine
1347 	 * does not remove it.
1348 	 */
1349 	vm_object_pip_add(object, bp->b_xio.xio_npages);
1350 
1351 	/*
1352 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
1353 	 * this point because we automatically release it on completion.
1354 	 * Instead, we look at the one page we are interested in which we
1355 	 * still hold a lock on even through the I/O completion.
1356 	 *
1357 	 * The other pages in our m[] array are also released on completion,
1358 	 * so we cannot assume they are valid anymore either.
1359 	 */
1360 	bp->b_cmd = BUF_CMD_READ;
1361 	BUF_KERNPROC(bp);
1362 	vn_strategy(swapdev_vp, bio);
1363 
1364 	/*
1365 	 * Wait for the page we want to complete.  PG_SWAPINPROG is always
1366 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
1367 	 * is set in the meta-data.
1368 	 *
1369 	 * If this is a read-ahead only we return immediately without
1370 	 * waiting for I/O.
1371 	 */
1372 	if (raonly)
1373 		return(VM_PAGER_OK);
1374 
1375 	/*
1376 	 * Read-ahead includes originally requested page case.
1377 	 */
1378 	crit_enter();
1379 	lwkt_gettoken(&vm_token);
1380 	while ((mreq->flags & PG_SWAPINPROG) != 0) {
1381 		vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
1382 		mycpu->gd_cnt.v_intrans++;
1383 		if (tsleep(mreq, 0, "swread", hz*20)) {
1384 			kprintf(
1385 			    "swap_pager: indefinite wait buffer: "
1386 				" offset: %lld, size: %ld\n",
1387 			    (long long)bio->bio_offset,
1388 			    (long)bp->b_bcount
1389 			);
1390 		}
1391 	}
1392 	lwkt_reltoken(&vm_token);
1393 	crit_exit();
1394 
1395 	/*
1396 	 * mreq is left busied after completion, but all the other pages
1397 	 * are freed.  If we had an unrecoverable read error the page will
1398 	 * not be valid.
1399 	 */
1400 	if (mreq->valid != VM_PAGE_BITS_ALL)
1401 		return(VM_PAGER_ERROR);
1402 	else
1403 		return(VM_PAGER_OK);
1404 
1405 	/*
1406 	 * A final note: in a low swap situation, we cannot deallocate swap
1407 	 * and mark a page dirty here because the caller is likely to mark
1408 	 * the page clean when we return, causing the page to possibly revert
1409 	 * to all-zero's later.
1410 	 */
1411 }
1412 
1413 /*
1414  *	swap_pager_putpages:
1415  *
1416  *	Assign swap (if necessary) and initiate I/O on the specified pages.
1417  *
1418  *	We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
1419  *	are automatically converted to SWAP objects.
1420  *
1421  *	In a low memory situation we may block in vn_strategy(), but the new
1422  *	vm_page reservation system coupled with properly written VFS devices
1423  *	should ensure that no low-memory deadlock occurs.  This is an area
1424  *	which needs work.
1425  *
1426  *	The parent has N vm_object_pip_add() references prior to
1427  *	calling us and will remove references for rtvals[] that are
1428  *	not set to VM_PAGER_PEND.  We need to remove the rest on I/O
1429  *	completion.
1430  *
1431  *	The parent has soft-busy'd the pages it passes us and will unbusy
1432  *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
1433  *	We need to unbusy the rest on I/O completion.
1434  *
1435  * No requirements.
1436  */
1437 void
1438 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
1439 		    boolean_t sync, int *rtvals)
1440 {
1441 	int i;
1442 	int n = 0;
1443 
1444 	if (count && m[0]->object != object) {
1445 		panic("swap_pager_putpages: object mismatch %p/%p",
1446 		    object,
1447 		    m[0]->object
1448 		);
1449 	}
1450 
1451 	/*
1452 	 * Step 1
1453 	 *
1454 	 * Turn object into OBJT_SWAP
1455 	 * check for bogus sysops
1456 	 * force sync if not pageout process
1457 	 */
1458 	if (object->type == OBJT_DEFAULT) {
1459 		lwkt_gettoken(&vm_token);
1460 		if (object->type == OBJT_DEFAULT)
1461 			swp_pager_meta_convert(object);
1462 		lwkt_reltoken(&vm_token);
1463 	}
1464 
1465 	if (curthread != pagethread)
1466 		sync = TRUE;
1467 
1468 	/*
1469 	 * Step 2
1470 	 *
1471 	 * Update nsw parameters from swap_async_max sysctl values.
1472 	 * Do not let the sysop crash the machine with bogus numbers.
1473 	 */
1474 
1475 	if (swap_async_max != nsw_wcount_async_max) {
1476 		int n;
1477 
1478 		/*
1479 		 * limit range
1480 		 */
1481 		if ((n = swap_async_max) > nswbuf / 2)
1482 			n = nswbuf / 2;
1483 		if (n < 1)
1484 			n = 1;
1485 		swap_async_max = n;
1486 
1487 		/*
1488 		 * Adjust difference ( if possible ).  If the current async
1489 		 * count is too low, we may not be able to make the adjustment
1490 		 * at this time.
1491 		 */
1492 		crit_enter();
1493 		lwkt_gettoken(&vm_token);
1494 		n -= nsw_wcount_async_max;
1495 		if (nsw_wcount_async + n >= 0) {
1496 			nsw_wcount_async += n;
1497 			nsw_wcount_async_max += n;
1498 			wakeup(&nsw_wcount_async);
1499 		}
1500 		lwkt_reltoken(&vm_token);
1501 		crit_exit();
1502 	}
1503 
1504 	/*
1505 	 * Step 3
1506 	 *
1507 	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
1508 	 * The page is left dirty until the pageout operation completes
1509 	 * successfully.
1510 	 */
1511 
1512 	for (i = 0; i < count; i += n) {
1513 		struct buf *bp;
1514 		struct bio *bio;
1515 		daddr_t blk;
1516 		int j;
1517 
1518 		/*
1519 		 * Maximum I/O size is limited by a number of factors.
1520 		 */
1521 
1522 		n = min(BLIST_MAX_ALLOC, count - i);
1523 		n = min(n, nsw_cluster_max);
1524 
1525 		crit_enter();
1526 		lwkt_gettoken(&vm_token);
1527 
1528 		/*
1529 		 * Get biggest block of swap we can.  If we fail, fall
1530 		 * back and try to allocate a smaller block.  Don't go
1531 		 * overboard trying to allocate space if it would overly
1532 		 * fragment swap.
1533 		 */
1534 		while (
1535 		    (blk = swp_pager_getswapspace(object, n)) == SWAPBLK_NONE &&
1536 		    n > 4
1537 		) {
1538 			n >>= 1;
1539 		}
1540 		if (blk == SWAPBLK_NONE) {
1541 			for (j = 0; j < n; ++j)
1542 				rtvals[i+j] = VM_PAGER_FAIL;
1543 			lwkt_reltoken(&vm_token);
1544 			crit_exit();
1545 			continue;
1546 		}
1547 
1548 		/*
1549 		 * The I/O we are constructing cannot cross a physical
1550 		 * disk boundary in the swap stripe.  Note: we are still
1551 		 * at splvm().
1552 		 */
1553 		if ((blk ^ (blk + n)) & dmmax_mask) {
1554 			j = ((blk + dmmax) & dmmax_mask) - blk;
1555 			swp_pager_freeswapspace(object, blk + j, n - j);
1556 			n = j;
1557 		}
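
		/*
		 * Editorial worked example (not in the original source),
		 * assuming dmmax = 32: blk = 28, n = 8 crosses a stripe
		 * since (28 ^ 36) & ~31 != 0.  Then j = ((28 + 32) & ~31)
		 * - 28 = 4, blocks 32..35 are returned to the bitmap and
		 * the I/O is trimmed to the 4 pages at 28..31.
		 */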
1558 
1559 		/*
1560 		 * All I/O parameters have been satisfied, build the I/O
1561 		 * request and assign the swap space.
1562 		 */
1563 		if (sync == TRUE)
1564 			bp = getpbuf(&nsw_wcount_sync);
1565 		else
1566 			bp = getpbuf(&nsw_wcount_async);
1567 		bio = &bp->b_bio1;
1568 
1569 		pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
1570 
1571 		bp->b_bcount = PAGE_SIZE * n;
1572 		bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1573 
1574 		for (j = 0; j < n; ++j) {
1575 			vm_page_t mreq = m[i+j];
1576 
1577 			swp_pager_meta_build(mreq->object, mreq->pindex,
1578 					     blk + j);
1579 			if (object->type == OBJT_SWAP)
1580 				vm_page_dirty(mreq);
1581 			rtvals[i+j] = VM_PAGER_OK;
1582 
1583 			vm_page_flag_set(mreq, PG_SWAPINPROG);
1584 			bp->b_xio.xio_pages[j] = mreq;
1585 		}
1586 		bp->b_xio.xio_npages = n;
1587 
1588 		mycpu->gd_cnt.v_swapout++;
1589 		mycpu->gd_cnt.v_swappgsout += bp->b_xio.xio_npages;
1590 
1591 		lwkt_reltoken(&vm_token);
1592 		crit_exit();
1593 
1594 		bp->b_dirtyoff = 0;		/* req'd for NFS */
1595 		bp->b_dirtyend = bp->b_bcount;	/* req'd for NFS */
1596 		bp->b_cmd = BUF_CMD_WRITE;
1597 		bio->bio_caller_info1.index = SWBIO_WRITE;
1598 
1599 		/*
1600 		 * asynchronous
1601 		 */
1602 		if (sync == FALSE) {
1603 			bio->bio_done = swp_pager_async_iodone;
1604 			BUF_KERNPROC(bp);
1605 			vn_strategy(swapdev_vp, bio);
1606 
1607 			for (j = 0; j < n; ++j)
1608 				rtvals[i+j] = VM_PAGER_PEND;
1609 			continue;
1610 		}
1611 
1612 		/*
1613 		 * Issue synchronously.
1614 		 *
1615 		 * Wait for the sync I/O to complete, then update rtvals.
1616 		 * We just set the rtvals[] to VM_PAGER_PEND so we can call
1617 		 * our async completion routine at the end, thus avoiding a
1618 		 * double-free.
1619 		 */
1620 		bio->bio_caller_info1.index |= SWBIO_SYNC;
1621 		bio->bio_done = biodone_sync;
1622 		bio->bio_flags |= BIO_SYNC;
1623 		vn_strategy(swapdev_vp, bio);
1624 		biowait(bio, "swwrt");
1625 
1626 		for (j = 0; j < n; ++j)
1627 			rtvals[i+j] = VM_PAGER_PEND;
1628 
1629 		/*
1630 		 * Now that we are through with the bp, we can call the
1631 		 * normal async completion, which frees everything up.
1632 		 */
1633 		swp_pager_async_iodone(bio);
1634 	}
1635 }
1636 
1637 /*
1638  * No requirements.
1639  */
1640 void
1641 swap_pager_newswap(void)
1642 {
1643 	swp_sizecheck();
1644 }
1645 
1646 /*
1647  *	swp_pager_async_iodone:
1648  *
1649  *	Completion routine for asynchronous reads and writes from/to swap.
1650  *	Also called manually by synchronous code to finish up a bp.
1651  *
1652  *	For READ operations, the pages are PG_BUSY'd.  For WRITE operations,
1653  *	the pages are vm_page_t->busy'd.  For READ operations, we PG_BUSY
1654  *	unbusy all pages except the 'main' request page.  For WRITE
1655  *	operations, we vm_page_t->busy'd unbusy all pages ( we can do this
1656  *	because we marked them all VM_PAGER_PEND on return from putpages ).
1657  *
1658  *	This routine may not block.
1659  *
1660  * No requirements.
1661  */
1662 static void
1663 swp_pager_async_iodone(struct bio *bio)
1664 {
1665 	struct buf *bp = bio->bio_buf;
1666 	vm_object_t object = NULL;
1667 	int i;
1668 	int *nswptr;
1669 
1670 	/*
1671 	 * report error
1672 	 */
1673 	if (bp->b_flags & B_ERROR) {
1674 		kprintf(
1675 		    "swap_pager: I/O error - %s failed; offset %lld, "
1676 			"size %ld, error %d\n",
1677 		    ((bio->bio_caller_info1.index & SWBIO_READ) ?
1678 			"pagein" : "pageout"),
1679 		    (long long)bio->bio_offset,
1680 		    (long)bp->b_bcount,
1681 		    bp->b_error
1682 		);
1683 	}
1684 
1685 	/*
1686 	 * set object, raise to splvm().
1687 	 */
1688 	if (bp->b_xio.xio_npages)
1689 		object = bp->b_xio.xio_pages[0]->object;
1690 	crit_enter();
1691 	lwkt_gettoken(&vm_token);
1692 
1693 	/*
1694 	 * remove the mapping for kernel virtual
1695 	 */
1696 	pmap_qremove((vm_offset_t)bp->b_data, bp->b_xio.xio_npages);
1697 
1698 	/*
1699 	 * cleanup pages.  If an error occurs writing to swap, we are in
1700 	 * very serious trouble.  If it happens to be a disk error, though,
1701 	 * we may be able to recover by reassigning the swap later on.  So
1702 	 * in this case we remove the m->swapblk assignment for the page
1703 	 * but do not free it in the rlist.  The errornous block(s) are thus
1704 	 * but do not free it in the rlist.  The erroneous block(s) are thus
1705 	 */
1706 	for (i = 0; i < bp->b_xio.xio_npages; ++i) {
1707 		vm_page_t m = bp->b_xio.xio_pages[i];
1708 
1709 		if (bp->b_flags & B_ERROR) {
1710 			/*
1711 			 * If an error occurs I'd love to throw the swapblk
1712 			 * away without freeing it back to swapspace, so it
1713 			 * can never be used again.  But I can't from an
1714 			 * interrupt.
1715 			 */
1716 
1717 			if (bio->bio_caller_info1.index & SWBIO_READ) {
1718 				/*
1719 				 * When reading, reqpage needs to stay
1720 				 * locked for the parent, but all other
1721 				 * pages can be freed.  We still want to
1722 				 * wakeup the parent waiting on the page,
1723 				 * though.  ( also: pg_reqpage can be -1 and
1724 				 * not match anything ).
1725 				 *
1726 				 * We have to wake specifically requested pages
1727 				 * up too because we cleared PG_SWAPINPROG and
1728 				 * someone may be waiting for that.
1729 				 *
1730 				 * NOTE: for reads, m->dirty will probably
1731 				 * be overridden by the original caller of
1732 				 * getpages so don't play cute tricks here.
1733 				 *
1734 				 * NOTE: We can't actually free the page from
1735 				 * here, because this is an interrupt.  It
1736 				 * is not legal to mess with object->memq
1737 				 * from an interrupt.  Deactivate the page
1738 				 * instead.
1739 				 */
1740 
1741 				m->valid = 0;
1742 				vm_page_flag_clear(m, PG_ZERO);
1743 				vm_page_flag_clear(m, PG_SWAPINPROG);
1744 
1745 				/*
1746 				 * bio_driver_info holds the requested page
1747 				 * index.
1748 				 */
1749 				if (i != (int)(intptr_t)bio->bio_driver_info) {
1750 					vm_page_deactivate(m);
1751 					vm_page_wakeup(m);
1752 				} else {
1753 					vm_page_flash(m);
1754 				}
1755 				/*
1756 				 * If i == bp->b_pager.pg_reqpage, do not wake
1757 				 * the page up.  The caller needs to.
1758 				 */
1759 			} else {
1760 				/*
1761 				 * If a write error occurs remove the swap
1762 				 * assignment (note that PG_SWAPPED may or
1763 				 * may not be set depending on prior activity).
1764 				 *
1765 				 * Re-dirty OBJT_SWAP pages as there is no
1766 				 * other backing store, we can't throw the
1767 				 * page away.
1768 				 *
1769 				 * Non-OBJT_SWAP pages (aka swapcache) must
1770 				 * not be dirtied since they may not have
1771 				 * been dirty in the first place, and they
1772 				 * do have backing store (the vnode).
1773 				 */
1774 				swp_pager_meta_ctl(m->object, m->pindex,
1775 						   SWM_FREE);
1776 				vm_page_flag_clear(m, PG_SWAPPED);
1777 				if (m->object->type == OBJT_SWAP) {
1778 					vm_page_dirty(m);
1779 					vm_page_activate(m);
1780 				}
1781 				vm_page_flag_clear(m, PG_SWAPINPROG);
1782 				vm_page_io_finish(m);
1783 			}
1784 		} else if (bio->bio_caller_info1.index & SWBIO_READ) {
1785 			/*
1786 			 * NOTE: for reads, m->dirty will probably be
1787 			 * overridden by the original caller of getpages so
1788 			 * we cannot set them in order to free the underlying
1789 			 * swap in a low-swap situation.  I don't think we'd
1790 			 * want to do that anyway, but it was an optimization
1791 			 * that existed in the old swapper for a time before
1792 			 * it got ripped out due to precisely this problem.
1793 			 *
1794 			 * clear PG_ZERO in page.
1795 			 *
1796 			 * If not the requested page then deactivate it.
1797 			 *
1798 			 * Note that the requested page, reqpage, is left
1799 			 * busied, but we still have to wake it up.  The
1800 			 * other pages are released (unbusied) by
1801 			 * vm_page_wakeup().  We do not set reqpage's
1802 			 * valid bits here, it is up to the caller.
1803 			 */
1804 
1805 			/*
1806 			 * NOTE: can't call pmap_clear_modify(m) from an
1807 			 * interrupt thread, the pmap code may have to map
1808 			 * non-kernel pmaps and currently asserts the case.
1809 			 */
1810 			/*pmap_clear_modify(m);*/
1811 			m->valid = VM_PAGE_BITS_ALL;
1812 			vm_page_undirty(m);
1813 			vm_page_flag_clear(m, PG_ZERO | PG_SWAPINPROG);
1814 			vm_page_flag_set(m, PG_SWAPPED);
1815 
1816 			/*
1817 			 * We have to wake specifically requested pages
1818 			 * up too because we cleared PG_SWAPINPROG and
1819 			 * getpages may be waiting on it.  However, be
1820 			 * sure not to unbusy the page that getpages
1821 			 * specifically requested - getpages expects it
1822 			 * to be left busy.
1823 			 *
1824 			 * bio_driver_info holds the requested page index.
1825 			 */
1826 			if (i != (int)(intptr_t)bio->bio_driver_info) {
1827 				vm_page_deactivate(m);
1828 				vm_page_wakeup(m);
1829 			} else {
1830 				vm_page_flash(m);
1831 			}
1832 		} else {
1833 			/*
1834 			 * Mark the page clean but do not mess with the
1835 			 * pmap-layer's modified state.  That state should
1836 			 * also be clear since the caller protected the
1837 			 * page with VM_PROT_READ, but allow for the case.
1838 			 *
1839 			 * We are in an interrupt, avoid pmap operations.
1840 			 *
1841 			 * If we have a severe page deficit, deactivate the
1842 			 * page.  Do not try to cache it (which would also
1843 			 * involve a pmap op), because the page might still
1844 			 * be read-heavy.
1845 			 *
1846 			 * When using the swap to cache clean vnode pages
1847 			 * we do not mess with the page dirty bits.
1848 			 */
1849 			if (m->object->type == OBJT_SWAP)
1850 				vm_page_undirty(m);
1851 			vm_page_flag_clear(m, PG_SWAPINPROG);
1852 			vm_page_flag_set(m, PG_SWAPPED);
1853 			vm_page_io_finish(m);
1854 			if (vm_page_count_severe())
1855 				vm_page_deactivate(m);
1856 #if 0
1857 			if (!vm_page_count_severe() || !vm_page_try_to_cache(m))
1858 				vm_page_protect(m, VM_PROT_READ);
1859 #endif
1860 		}
1861 	}
1862 
1863 	/*
1864 	 * adjust pip.  NOTE: the original parent may still have its own
1865 	 * pip refs on the object.
1866 	 */
1867 
1868 	if (object)
1869 		vm_object_pip_wakeupn(object, bp->b_xio.xio_npages);
1870 
1871 	/*
1872 	 * Release the physical I/O buffer.
1873 	 *
1874 	 * NOTE: Due to synchronous operations in the write case b_cmd may
1875 	 *	 already be set to BUF_CMD_DONE and BIO_SYNC may have already
1876 	 *	 been cleared.
1877 	 */
1878 	if (bio->bio_caller_info1.index & SWBIO_READ)
1879 		nswptr = &nsw_rcount;
1880 	else if (bio->bio_caller_info1.index & SWBIO_SYNC)
1881 		nswptr = &nsw_wcount_sync;
1882 	else
1883 		nswptr = &nsw_wcount_async;
1884 	bp->b_cmd = BUF_CMD_DONE;
1885 	relpbuf(bp, nswptr);
1886 	lwkt_reltoken(&vm_token);
1887 	crit_exit();
1888 }
1889 
1890 /************************************************************************
1891  *				SWAP META DATA 				*
1892  ************************************************************************
1893  *
1894  *	These routines manipulate the swap metadata stored in the
1895  *	OBJT_SWAP object.  All swp_*() routines must be called with
1896  *	vm_token held because swap can be freed up by the low level
1897  *	vm_page code, which may run from interrupt context.
1898  *
1899  *	Swap metadata is kept in a per-object red-black tree of swblock
1900  *	structures rather than being linked directly into the pages;
1901  *	the object simply contains appropriate tracking counters.
1902  */
1903 
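/*
 * Conceptual sketch of the metadata layout (illustrative only; the
 * authoritative definition of struct swblock lives in the swap pager
 * headers, and the RB linkage field name below is a placeholder):
 */
#if 0
struct swblock_sketch {
	RB_ENTRY(swblock_sketch) swb_entry; /* object->swblock_root linkage */
	vm_pindex_t	swb_index;	/* base pindex (pindex & ~SWAP_META_MASK) */
	int		swb_count;	/* slots below that are not SWAPBLK_NONE */
	daddr_t		swb_pages[SWAP_META_PAGES]; /* one swap address per page */
};
#endif
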
1904 /*
1905  * Lookup the swblock containing the specified swap block index.
1906  *
1907  * The caller must hold vm_token.
1908  */
1909 static __inline
1910 struct swblock *
1911 swp_pager_lookup(vm_object_t object, vm_pindex_t index)
1912 {
1913 	index &= ~SWAP_META_MASK;
1914 	return (RB_LOOKUP(swblock_rb_tree, &object->swblock_root, index));
1915 }
1916 
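/*
 * Illustrative sketch (not compiled into the pager): how a pindex
 * resolves to a swap block address.  The swblock covering a page is
 * found by masking off the low bits of the pindex; the masked-off bits
 * then select the slot within swb_pages[].  This mirrors the lookup
 * path used by swp_pager_meta_ctl() further below.
 */
#if 0
static daddr_t
swp_pager_lookup_example(vm_object_t object, vm_pindex_t index)
{
	struct swblock *swap;

	/* the caller must hold vm_token, as with the real lookup */
	swap = swp_pager_lookup(object, index);
	if (swap == NULL)
		return (SWAPBLK_NONE);
	return (swap->swb_pages[index & SWAP_META_MASK]);
}
#endif
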
1917 /*
1918  * Remove a swblock from the RB tree.
1919  *
1920  * The caller must hold vm_token.
1921  */
1922 static __inline
1923 void
1924 swp_pager_remove(vm_object_t object, struct swblock *swap)
1925 {
1926 	RB_REMOVE(swblock_rb_tree, &object->swblock_root, swap);
1927 }
1928 
1929 /*
1930  * Convert default object to swap object if necessary
1931  *
1932  * The caller must hold vm_token.
1933  */
1934 static void
1935 swp_pager_meta_convert(vm_object_t object)
1936 {
1937 	if (object->type == OBJT_DEFAULT) {
1938 		object->type = OBJT_SWAP;
1939 		KKASSERT(object->swblock_count == 0);
1940 	}
1941 }
1942 
1943 /*
1944  * SWP_PAGER_META_BUILD() -	add swap block to swap meta data for object
1945  *
1946  *	We first convert the object to a swap object if it is a default
1947  *	object.  Vnode objects do not need to be converted.
1948  *
1949  *	The specified swapblk is added to the object's swap metadata; it
1950  *	must be a valid block (the routine asserts this).  Any previously
1951  *	assigned swapblk for the index is freed.
1952  *
1953  * The caller must hold vm_token.
1954  */
1955 static void
1956 swp_pager_meta_build(vm_object_t object, vm_pindex_t index, daddr_t swapblk)
1957 {
1958 	struct swblock *swap;
1959 	struct swblock *oswap;
1960 
1961 	KKASSERT(swapblk != SWAPBLK_NONE);
1962 
1963 	/*
1964 	 * Convert object if necessary
1965 	 */
1966 	if (object->type == OBJT_DEFAULT)
1967 		swp_pager_meta_convert(object);
1968 
1969 	/*
1970 	 * Locate the swblock, creating it if it does not exist.  If the
1971 	 * zone allocation fails we wait for memory and retry, since the
1972 	 * tree may have changed in the meantime.
1973 	 */
1974 retry:
1975 	swap = swp_pager_lookup(object, index);
1976 
1977 	if (swap == NULL) {
1978 		int i;
1979 
1980 		swap = zalloc(swap_zone);
1981 		if (swap == NULL) {
1982 			vm_wait(0);
1983 			goto retry;
1984 		}
1985 		swap->swb_index = index & ~SWAP_META_MASK;
1986 		swap->swb_count = 0;
1987 
1988 		++object->swblock_count;
1989 
1990 		for (i = 0; i < SWAP_META_PAGES; ++i)
1991 			swap->swb_pages[i] = SWAPBLK_NONE;
1992 		oswap = RB_INSERT(swblock_rb_tree, &object->swblock_root, swap);
1993 		KKASSERT(oswap == NULL);
1994 	}
1995 
1996 	/*
1997 	 * Delete prior contents of metadata
1998 	 */
1999 
2000 	index &= SWAP_META_MASK;
2001 
2002 	if (swap->swb_pages[index] != SWAPBLK_NONE) {
2003 		swp_pager_freeswapspace(object, swap->swb_pages[index], 1);
2004 		--swap->swb_count;
2005 	}
2006 
2007 	/*
2008 	 * Enter block into metadata
2009 	 */
2010 	swap->swb_pages[index] = swapblk;
2011 	if (swapblk != SWAPBLK_NONE)
2012 		++swap->swb_count;
2013 }
2014 
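/*
 * Usage sketch (illustrative only, not part of the pager): a build
 * followed by a plain lookup returns the block just recorded.  Any
 * block previously recorded for the index is released back to the
 * swap bitmap by swp_pager_meta_build() itself, so callers do not
 * free it separately.
 */
#if 0
static void
swp_pager_meta_build_example(vm_object_t object, vm_pindex_t index,
			     daddr_t blk)
{
	daddr_t check;

	/* the caller must hold vm_token */
	swp_pager_meta_build(object, index, blk);

	/* flags == 0 is a pure lookup and leaves the metadata intact */
	check = swp_pager_meta_ctl(object, index, 0);
	KKASSERT(check == blk);
}
#endif
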
2015 /*
2016  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
2017  *
2018  *	The requested range of blocks is freed, with any associated swap
2019  *	returned to the swap bitmap.
2020  *
2021  *	This routine will free swap metadata structures as they are cleaned
2022  *	out.  This routine does *NOT* operate on swap metadata associated
2023  *	with resident pages.
2024  *
2025  * The caller must hold vm_token.
2026  */
2027 static int swp_pager_meta_free_callback(struct swblock *swb, void *data);
2028 
2029 static void
2030 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count)
2031 {
2032 	struct swfreeinfo info;
2033 
2034 	/*
2035 	 * Nothing to do
2036 	 */
2037 	if (object->swblock_count == 0) {
2038 		KKASSERT(RB_EMPTY(&object->swblock_root));
2039 		return;
2040 	}
2041 	if (count == 0)
2042 		return;
2043 
2044 	/*
2045 	 * Setup for RB tree scan.  Note that the pindex range can be huge
2046 	 * due to the 64 bit page index space so we cannot safely iterate.
2047 	 */
2048 	info.object = object;
2049 	info.basei = index & ~SWAP_META_MASK;
2050 	info.begi = index;
2051 	info.endi = index + count - 1;
2052 	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_scancmp,
2053 				swp_pager_meta_free_callback, &info);
2054 }
2055 
2056 /*
2057  * The caller must hold vm_token.
2058  */
2059 static
2060 int
2061 swp_pager_meta_free_callback(struct swblock *swap, void *data)
2062 {
2063 	struct swfreeinfo *info = data;
2064 	vm_object_t object = info->object;
2065 	int index;
2066 	int eindex;
2067 
2068 	/*
2069 	 * Figure out the range within the swblock.  The wider scan may
2070 	 * return edge-case swap blocks when the start and/or end points
2071 	 * are in the middle of a block.
2072 	 */
2073 	if (swap->swb_index < info->begi)
2074 		index = (int)info->begi & SWAP_META_MASK;
2075 	else
2076 		index = 0;
2077 
2078 	if (swap->swb_index + SWAP_META_PAGES > info->endi)
2079 		eindex = (int)info->endi & SWAP_META_MASK;
2080 	else
2081 		eindex = SWAP_META_MASK;
2082 
2083 	/*
2084 	 * Scan and free the blocks.  The loop terminates early if (swap)
2085 	 * runs out of blocks, in which case the swblock itself is freed.
2086 	 */
2087 	while (index <= eindex) {
2088 		daddr_t v = swap->swb_pages[index];
2089 
2090 		if (v != SWAPBLK_NONE) {
2091 			swp_pager_freeswapspace(object, v, 1);
2092 			swap->swb_pages[index] = SWAPBLK_NONE;
2093 			if (--swap->swb_count == 0) {
2094 				swp_pager_remove(object, swap);
2095 				zfree(swap_zone, swap);
2096 				--object->swblock_count;
2097 				break;
2098 			}
2099 		}
2100 		++index;
2101 	}
2102 	/* swap may be invalid here due to zfree above */
2103 	return(0);
2104 }
2105 
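/*
 * Worked example (assuming, for concreteness, SWAP_META_PAGES == 16 and
 * therefore SWAP_META_MASK == 15): freeing the pindex range [5, 20]
 * visits two swblocks, and the clamping above yields:
 *
 *	swblock at swb_index 0:  swb_index < begi, so
 *		index  = 5 & SWAP_META_MASK  = 5
 *		eindex = SWAP_META_MASK      = 15	(slots 5..15 freed)
 *
 *	swblock at swb_index 16: swb_index + SWAP_META_PAGES > endi, so
 *		index  = 0
 *		eindex = 20 & SWAP_META_MASK = 4	(slots 0..4 freed)
 */
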
2106 /*
2107  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
2108  *
2109  *	This routine locates and destroys all swap metadata associated with
2110  *	an object.
2111  *
2112  * The caller must hold vm_token.
2113  */
2114 static void
2115 swp_pager_meta_free_all(vm_object_t object)
2116 {
2117 	struct swblock *swap;
2118 	int i;
2119 
2120 	while ((swap = RB_ROOT(&object->swblock_root)) != NULL) {
2121 		swp_pager_remove(object, swap);
2122 		for (i = 0; i < SWAP_META_PAGES; ++i) {
2123 			daddr_t v = swap->swb_pages[i];
2124 			if (v != SWAPBLK_NONE) {
2125 				--swap->swb_count;
2126 				swp_pager_freeswapspace(object, v, 1);
2127 			}
2128 		}
2129 		if (swap->swb_count != 0)
2130 			panic("swap_pager_meta_free_all: swb_count != 0");
2131 		zfree(swap_zone, swap);
2132 		--object->swblock_count;
2133 	}
2134 	KKASSERT(object->swblock_count == 0);
2135 }
2136 
2137 /*
2138  * SWP_PAGER_META_CTL() - misc control of swap and vm_page_t meta data.
2139  *
2140  *	This routine is capable of looking up, popping, or freeing
2141  *	swapblk assignments in the swap meta data or in the vm_page_t.
2142  *	The routine typically returns the swapblk being looked-up or
2143  *	popped, or SWAPBLK_NONE if the block was freed or was never
2144  *	assigned in the first place.
2145  *
2146  *	It is not possible to store invalid swapblks in the swap meta
2147  *	data (other than a literal 'SWAPBLK_NONE'), so we don't bother
2148  *	checking.
2149  *
2150  *	When acting on a busy resident page and paging is in progress, we
2151  *	have to wait until paging is complete but otherwise can act on the
2152  *	busy page.
2153  *
2154  *	SWM_FREE	remove and free swap block from metadata
2155  *	SWM_POP		remove from meta data but do not free; pop it out
2156  *
2157  * The caller must hold vm_token.
2158  */
2159 static daddr_t
2160 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t index, int flags)
2161 {
2162 	struct swblock *swap;
2163 	daddr_t r1;
2164 
2165 	if (object->swblock_count == 0)
2166 		return(SWAPBLK_NONE);
2167 
2168 	r1 = SWAPBLK_NONE;
2169 	swap = swp_pager_lookup(object, index);
2170 
2171 	if (swap != NULL) {
2172 		index &= SWAP_META_MASK;
2173 		r1 = swap->swb_pages[index];
2174 
2175 		if (r1 != SWAPBLK_NONE) {
2176 			if (flags & SWM_FREE) {
2177 				swp_pager_freeswapspace(object, r1, 1);
2178 				r1 = SWAPBLK_NONE;
2179 			}
2180 			if (flags & (SWM_FREE|SWM_POP)) {
2181 				swap->swb_pages[index] = SWAPBLK_NONE;
2182 				if (--swap->swb_count == 0) {
2183 					swp_pager_remove(object, swap);
2184 					zfree(swap_zone, swap);
2185 					--object->swblock_count;
2186 				}
2187 			}
2188 		}
2189 	}
2190 	return(r1);
2191 }
2192
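/*
 * Usage sketch (illustrative only): the three ways callers typically
 * drive swp_pager_meta_ctl().  A flags value of 0 is a pure lookup,
 * SWM_POP removes the assignment but hands the block back to the
 * caller, and SWM_FREE removes the assignment and returns the swap
 * space to the bitmap (the return value is then SWAPBLK_NONE).
 */
#if 0
static void
swp_pager_meta_ctl_example(vm_object_t object, vm_pindex_t index)
{
	daddr_t blk;

	/* the caller must hold vm_token; each call below is an alternative */

	/* pure lookup - the metadata is left intact */
	blk = swp_pager_meta_ctl(object, index, 0);

	/* pop - the assignment is removed, blk is returned for reuse */
	blk = swp_pager_meta_ctl(object, index, SWM_POP);

	/* free - the assignment is removed and the swap space released */
	(void)swp_pager_meta_ctl(object, index, SWM_FREE);

	(void)blk;
}
#endif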