xref: /dflybsd-src/sys/vm/vm_swapcache.c (revision 884717e1debcf4b08bda1d29d01b0c8a34b86a59)
/*
 * (MPSAFE)
 *
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implement the swapcache daemon.  When enabled, swap is assumed to be
 * configured on a fast storage device such as an SSD.  Swap is assigned
 * to clean vnode-backed pages in the inactive queue, clustered by object
 * if possible, and written out.  The swap assignment sticks around even
 * after the underlying pages have been recycled.
 *
 * The daemon manages write bandwidth based on sysctl settings to control
 * wear on the SSD.
 *
 * The vnode strategy code checks for these swap assignments and diverts
 * reads to the swap device when the data is present in the swapcache.
 *
 * This operates on both regular files and the block device vnodes used by
 * filesystems to manage meta-data.
 */
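
/*
 * Illustrative usage (not part of the code below): the sysctls declared
 * in this file live under vm.swapcache.*, so enabling the daemon from
 * userland typically looks something like
 *
 *	sysctl vm.swapcache.data_enable=1
 *	sysctl vm.swapcache.meta_enable=1
 *	sysctl vm.swapcache.read_enable=1
 *
 * and, since use_chflags defaults to 1, marking the data to be cached,
 * e.g. "chflags cache /usr/obj".  See swapcache(8) for the full tuning
 * guide.
 */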

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/* the kernel process "swapcached" */
static int vm_swapcached_flush (vm_page_t m, int isblkdev);
static int vm_swapcache_test(vm_page_t m);
static void vm_swapcache_writing(vm_page_t marker);
static void vm_swapcache_cleaning(vm_object_t marker);
static void vm_swapcache_movemarker(vm_object_t marker, vm_object_t object);
struct thread *swapcached_thread;

SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);

int vm_swapcache_read_enable;
int vm_swapcache_inactive_heuristic;
static int vm_swapcache_sleep;
static int vm_swapcache_maxlaunder = 256;
static int vm_swapcache_data_enable = 0;
static int vm_swapcache_meta_enable = 0;
static int vm_swapcache_maxswappct = 75;
static int vm_swapcache_hysteresis;
int vm_swapcache_use_chflags = 1;	/* require chflags cache */
static int64_t vm_swapcache_minburst = 10000000LL;	/* 10MB */
static int64_t vm_swapcache_curburst = 4000000000LL;	/* 4G after boot */
static int64_t vm_swapcache_maxburst = 2000000000LL;	/* 2G nominal max */
static int64_t vm_swapcache_accrate = 100000LL;		/* 100K/s */
static int64_t vm_swapcache_write_count;
static int64_t vm_swapcache_maxfilesize;
static int64_t vm_swapcache_cleanperobj = 16*1024*1024;

SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
	CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");

SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
	CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
	CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
	CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
	CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
	CTLFLAG_RW, &vm_swapcache_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
	CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");

SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
	CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
	CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
	CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
	CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
	CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
	CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, cleanperobj,
	CTLFLAG_RW, &vm_swapcache_cleanperobj, 0, "");

#define SWAPMAX(adj)	\
	((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)
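
/*
 * Worked example: with the default maxswappct of 75, SWAPMAX(0) is 75%
 * of vm_swap_max and SWAPMAX(-10) is 65%; the main loop uses these two
 * values as its write/clean hysteresis band.
 */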

/*
 * When shutting down the machine we want to stop swapcache operation
 * immediately so swap is not accessed after devices have been shuttered.
 */
static void
shutdown_swapcache(void *arg __unused)
{
	vm_swapcache_read_enable = 0;
	vm_swapcache_data_enable = 0;
	vm_swapcache_meta_enable = 0;
	wakeup(&vm_swapcache_sleep);	/* shortcut 5-second wait */
}

/*
 * vm_swapcached is the high level swapcache daemon.
 *
 * No requirements.
 */
static void
vm_swapcached_thread(void)
{
	enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
	enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
	static struct vm_page page_marker[PQ_L2_SIZE];
	static struct vm_object object_marker;
	int q;

	/*
	 * Thread setup
	 */
	curthread->td_flags |= TDF_SYSTHREAD;
	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
			      swapcached_thread, SHUTDOWN_PRI_FIRST);
	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
			      NULL, SHUTDOWN_PRI_SECOND);

	/*
	 * Initialize our marker for the inactive scan (SWAPC_WRITING)
	 */
	bzero(&page_marker, sizeof(page_marker));
	for (q = 0; q < PQ_L2_SIZE; ++q) {
		page_marker[q].flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
		page_marker[q].queue = PQ_INACTIVE + q;
		page_marker[q].pc = q;
		page_marker[q].wire_count = 1;
		vm_page_queues_spin_lock(PQ_INACTIVE + q);
		TAILQ_INSERT_HEAD(
			&vm_page_queues[PQ_INACTIVE + q].pl,
			&page_marker[q], pageq);
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
	}

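	/*
	 * Start the heuristic fully negative so vm_swapcache_writing()
	 * holds off until the VM system has bumped it back to zero,
	 * i.e. until roughly half the inactive target worth of pages
	 * has accumulated.
	 */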
	vm_swapcache_hysteresis = vmstats.v_inactive_target / 2;
	vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;

	/*
	 * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
	 */
	bzero(&object_marker, sizeof(object_marker));
	object_marker.type = OBJT_MARKER;
	lwkt_gettoken(&vmobj_token);
	TAILQ_INSERT_HEAD(&vm_object_list, &object_marker, object_list);
	lwkt_reltoken(&vmobj_token);

	for (;;) {
		/*
		 * Handle shutdown
		 */
		kproc_suspend_loop();

		/*
		 * Check every 5 seconds when not enabled or if no swap
		 * is present.
		 */
		if ((vm_swapcache_data_enable == 0 &&
		     vm_swapcache_meta_enable == 0) ||
		    vm_swap_max == 0) {
			tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
			continue;
		}

		/*
		 * Polling rate when enabled is approximately 10 hz.
		 */
		tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);

		/*
		 * State hysteresis.  Generate write activity up to 75% of
		 * swap (SWAPMAX(0)), then clean out swap assignments down
		 * to 65% (SWAPMAX(-10)), then repeat.
		 */
		if (state == SWAPC_WRITING) {
			if (vm_swap_cache_use > SWAPMAX(0))
				state = SWAPC_CLEANING;
		} else {
			if (vm_swap_cache_use < SWAPMAX(-10))
				state = SWAPC_WRITING;
		}

		/*
		 * We are allowed to continue accumulating burst value
		 * in either state.  Allow the user to set curburst > maxburst
		 * for the initial load-in.
		 */
		if (vm_swapcache_curburst < vm_swapcache_maxburst) {
			vm_swapcache_curburst += vm_swapcache_accrate / 10;
			if (vm_swapcache_curburst > vm_swapcache_maxburst)
				vm_swapcache_curburst = vm_swapcache_maxburst;
		}
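
		/*
		 * e.g. at the default accrate of 100KB/sec and the ~10hz
		 * poll above, each pass adds roughly 10KB of burst credit;
		 * vm_swapcached_flush() later deducts the bytes it pushes
		 * to the swap pager from vm_swapcache_curburst.
		 */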

		/*
		 * We don't want to nickel-and-dime the scan as that will
		 * create unnecessary fragmentation.  The minimum burst
		 * is one second's worth of accumulation.
		 */
		if (state == SWAPC_WRITING) {
			if (vm_swapcache_curburst >= vm_swapcache_accrate) {
				if (burst == SWAPB_BURSTING) {
					for (q = 0; q < PQ_L2_SIZE; ++q) {
						vm_swapcache_writing(
							&page_marker[q]);
					}
					if (vm_swapcache_curburst <= 0)
						burst = SWAPB_RECOVERING;
				} else if (vm_swapcache_curburst >
					   vm_swapcache_minburst) {
					for (q = 0; q < PQ_L2_SIZE; ++q) {
						vm_swapcache_writing(
							&page_marker[q]);
					}
					burst = SWAPB_BURSTING;
				}
			}
		} else {
			vm_swapcache_cleaning(&object_marker);
		}
	}

	/*
	 * Cleanup (NOT REACHED)
	 */
	for (q = 0; q < PQ_L2_SIZE; ++q) {
		vm_page_queues_spin_lock(PQ_INACTIVE + q);
		TAILQ_REMOVE(
			&vm_page_queues[PQ_INACTIVE + q].pl,
			&page_marker[q], pageq);
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
	}

	lwkt_gettoken(&vmobj_token);
	TAILQ_REMOVE(&vm_object_list, &object_marker, object_list);
	lwkt_reltoken(&vmobj_token);
}

static struct kproc_desc swpc_kp = {
	"swapcached",
	vm_swapcached_thread,
	&swapcached_thread
};
SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp)

static void
vm_swapcache_writing(vm_page_t marker)
{
	vm_object_t object;
	struct vnode *vp;
	vm_page_t m;
	int count;
	int isblkdev;

	/*
	 * Deal with an overflow of the heuristic counter or if the user
	 * manually changes the hysteresis.
	 *
	 * Try to avoid small incremental pageouts by waiting for enough
	 * pages to build up in the inactive queue so we hopefully get a
	 * good burst in.  This heuristic is bumped by the VM system and
	 * reset when our scan hits the end of the queue.
	 */
	if (vm_swapcache_inactive_heuristic < -vm_swapcache_hysteresis)
		vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
	if (vm_swapcache_inactive_heuristic < 0)
		return;

	/*
	 * Scan the inactive queue from our marker to locate
	 * suitable pages to push to the swap cache.
	 *
	 * We are looking for clean vnode-backed pages.
	 *
	 * NOTE: PG_SWAPPED pages in particular are not part of
	 *	 our count because once the cache stabilizes we
	 *	 can end up with a very high datarate of VM pages
	 *	 cycling from it.
	 */
	count = vm_swapcache_maxlaunder;

	vm_page_queues_spin_lock(marker->queue);
	while ((m = TAILQ_NEXT(marker, pageq)) != NULL && count-- > 0) {
		KKASSERT(m->queue == marker->queue);

		if (vm_swapcache_curburst < 0)
			break;
		TAILQ_REMOVE(
			&vm_page_queues[marker->queue].pl, marker, pageq);
		TAILQ_INSERT_AFTER(
			&vm_page_queues[marker->queue].pl, m, marker, pageq);

		/*
		 * Ignore markers and ignore pages that already have a swap
		 * assignment.
		 */
		if (m->flags & (PG_MARKER | PG_SWAPPED)) {
			++count;
			continue;
		}
		if (vm_page_busy_try(m, TRUE))
			continue;
		vm_page_queues_spin_unlock(marker->queue);

		if ((object = m->object) == NULL) {
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}
		vm_object_hold(object);
		if (m->object != object) {
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}
		if (vm_swapcache_test(m)) {
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}

		vp = object->handle;
		if (vp == NULL) {
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}

		switch(vp->v_type) {
		case VREG:
			/*
			 * PG_NOTMETA generically means 'don't swapcache this',
			 * and HAMMER will set this for regular data buffers
			 * (and leave it unset for meta-data buffers) as
			 * appropriate when double buffering is enabled.
			 */
			if (m->flags & PG_NOTMETA) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}

			/*
			 * If data_enable is 0 do not try to swapcache data.
			 * If use_chflags is set then only swapcache data for
			 * VSWAPCACHE marked vnodes, otherwise any vnode.
			 */
			if (vm_swapcache_data_enable == 0 ||
			    ((vp->v_flag & VSWAPCACHE) == 0 &&
			     vm_swapcache_use_chflags)) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			if (vm_swapcache_maxfilesize &&
			    object->size >
			    (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			isblkdev = 0;
			break;
		case VCHR:
			/*
			 * PG_NOTMETA generically means 'don't swapcache this',
			 * and HAMMER will set this for regular data buffers
			 * (and leave it unset for meta-data buffers) as
			 * appropriate when double buffering is enabled.
			 */
			if (m->flags & PG_NOTMETA) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			if (vm_swapcache_meta_enable == 0) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			isblkdev = 1;
			break;
		default:
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}

		/*
		 * Assign swap and initiate I/O.
		 *
		 * (adjust for the --count which also occurs in the loop)
		 */
		count -= vm_swapcached_flush(m, isblkdev) - 1;

		/*
		 * Setup for next loop using marker.
		 */
		vm_object_drop(object);
		vm_page_queues_spin_lock(marker->queue);
	}

	/*
	 * The marker could wind up at the end, which is ok.  If we hit the
	 * end of the list adjust the heuristic.
	 *
	 * Earlier inactive pages that were dirty and become clean
	 * are typically moved to the end of PQ_INACTIVE by virtue
	 * of vfs_vmio_release() when they become unwired from the
	 * buffer cache.
	 */
	if (m == NULL)
		vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
	vm_page_queues_spin_unlock(marker->queue);
}

/*
 * Flush the specified page using the swap_pager.  The page
 * must be busied by the caller and its disposition will become
 * the responsibility of this function.
 *
 * Try to collect surrounding pages, including pages which may
 * have already been assigned swap.  Try to cluster within a
 * contiguous aligned SWAP_META_PAGES (typ 16 x PAGE_SIZE) block
 * to match what swap_pager_putpages() can do.
 *
 * We also want to try to match against the buffer cache blocksize
 * but we don't really know what it is here.  Since the buffer cache
 * wires and unwires pages in groups the fact that we skip wired pages
 * should be sufficient.
 *
 * Returns a count of pages we might have flushed (minimum 1).
 */
static
int
vm_swapcached_flush(vm_page_t m, int isblkdev)
{
	vm_object_t object;
	vm_page_t marray[SWAP_META_PAGES];
	vm_pindex_t basei;
	int rtvals[SWAP_META_PAGES];
	int x;
	int i;
	int j;
	int count;
	int error;

	vm_page_io_start(m);
	vm_page_protect(m, VM_PROT_READ);
	object = m->object;
	vm_object_hold(object);

	/*
	 * Try to cluster around (m), keeping in mind that the swap pager
	 * can only do SWAP_META_PAGES worth of contiguous write.
	 */
	x = (int)m->pindex & SWAP_META_MASK;
	marray[x] = m;
	basei = m->pindex;
	vm_page_wakeup(m);

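	/*
	 * marray[] is indexed by the low SWAP_META_MASK bits of the page
	 * index, so the cluster spans the aligned pindex range
	 * [basei - x, basei - x + SWAP_META_PAGES) with (m) at marray[x].
	 * Scan backwards to fill the slots below x, then forwards to fill
	 * the slots above it.
	 */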
	for (i = x - 1; i >= 0; --i) {
		m = vm_page_lookup_busy_try(object, basei - x + i,
					    TRUE, &error);
		if (error || m == NULL)
			break;
		if (vm_swapcache_test(m)) {
			vm_page_wakeup(m);
			break;
		}
		if (isblkdev && (m->flags & PG_NOTMETA)) {
			vm_page_wakeup(m);
			break;
		}
		vm_page_io_start(m);
		vm_page_protect(m, VM_PROT_READ);
		if (m->queue - m->pc == PQ_CACHE) {
			vm_page_unqueue_nowakeup(m);
			vm_page_deactivate(m);
		}
		marray[i] = m;
		vm_page_wakeup(m);
	}
	++i;

	for (j = x + 1; j < SWAP_META_PAGES; ++j) {
		m = vm_page_lookup_busy_try(object, basei - x + j,
					    TRUE, &error);
		if (error || m == NULL)
			break;
		if (vm_swapcache_test(m)) {
			vm_page_wakeup(m);
			break;
		}
		if (isblkdev && (m->flags & PG_NOTMETA)) {
			vm_page_wakeup(m);
			break;
		}
		vm_page_io_start(m);
		vm_page_protect(m, VM_PROT_READ);
		if (m->queue - m->pc == PQ_CACHE) {
			vm_page_unqueue_nowakeup(m);
			vm_page_deactivate(m);
		}
		marray[j] = m;
		vm_page_wakeup(m);
	}

	count = j - i;
	vm_object_pip_add(object, count);
	swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
	vm_swapcache_write_count += count * PAGE_SIZE;
	vm_swapcache_curburst -= count * PAGE_SIZE;

	while (i < j) {
		if (rtvals[i] != VM_PAGER_PEND) {
			vm_page_busy_wait(marray[i], FALSE, "swppgfd");
			vm_page_io_finish(marray[i]);
			vm_page_wakeup(marray[i]);
			vm_object_pip_wakeup(object);
		}
		++i;
	}
	vm_object_drop(object);
	return(count);
}

/*
 * Test whether a VM page is suitable for writing to the swapcache.
 * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
 *
 * Returns 0 on success, 1 on failure
 */
static int
vm_swapcache_test(vm_page_t m)
{
	vm_object_t object;

	if (m->flags & PG_UNMANAGED)
		return(1);
	if (m->hold_count || m->wire_count)
		return(1);
	if (m->valid != VM_PAGE_BITS_ALL)
		return(1);
	if (m->dirty & m->valid)
		return(1);
	if ((object = m->object) == NULL)
		return(1);
	if (object->type != OBJT_VNODE ||
	    (object->flags & OBJ_DEAD)) {
		return(1);
	}
	vm_page_test_dirty(m);
	if (m->dirty & m->valid)
		return(1);
	return(0);
}

/*
 * Cleaning pass.
 *
 * We clean whole objects up to 16MB
 */
static
void
vm_swapcache_cleaning(vm_object_t marker)
{
	vm_object_t object;
	struct vnode *vp;
	int count;
	int n;

	count = vm_swapcache_maxlaunder;

	/*
	 * Look for vnode objects
	 */
	lwkt_gettoken(&vmobj_token);

	while ((object = TAILQ_NEXT(marker, object_list)) != NULL) {
		/*
		 * We have to skip markers.  We cannot hold/drop marker
		 * objects!
		 */
		if (object->type == OBJT_MARKER) {
			vm_swapcache_movemarker(marker, object);
			continue;
		}

		/*
		 * Safety, or in case there are millions of VM objects
		 * without swapcache backing.
		 */
		if (--count <= 0)
			break;

		/*
		 * We must hold the object before potentially yielding.
		 */
		vm_object_hold(object);
		lwkt_yield();

		/*
		 * Only operate on live VNODE objects that are either
		 * VREG or VCHR (VCHR for meta-data).
		 */
		if ((object->type != OBJT_VNODE) ||
		    ((object->flags & OBJ_DEAD) ||
		     object->swblock_count == 0) ||
		    ((vp = object->handle) == NULL) ||
		    (vp->v_type != VREG && vp->v_type != VCHR)) {
			vm_object_drop(object);
			/* object may be invalid now */
			vm_swapcache_movemarker(marker, object);
			continue;
		}

		/*
		 * Reset the object pindex stored in the marker if the
		 * working object has changed.
		 */
		if (marker->backing_object != object) {
			marker->size = 0;
			marker->backing_object_offset = 0;
			marker->backing_object = object;
		}

		/*
		 * Look for swblocks starting at our iterator.
		 *
		 * The swap_pager_condfree() function attempts to free
		 * swap space starting at the specified index.  The index
		 * will be updated on return.  The function will return
		 * a scan factor (NOT the number of blocks freed).
		 *
		 * If it must cut its scan of the object short due to an
		 * excessive number of swblocks, or is able to free the
		 * requested number of blocks, it will return n >= count
		 * and we break and pick it back up on a future attempt.
		 *
		 * Scan the object linearly and try to batch large sets of
		 * blocks that are likely to clean out entire swap radix
		 * tree leafs.
		 */
		lwkt_token_swap();
		lwkt_reltoken(&vmobj_token);

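		/*
		 * Round the per-call limit up to a multiple of
		 * SWAP_META_PAGES so whole swblocks (swap radix tree leafs)
		 * tend to be freed together rather than split across passes.
		 */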
		n = swap_pager_condfree(object, &marker->size,
				    (count + SWAP_META_MASK) & ~SWAP_META_MASK);

		vm_object_drop(object);		/* object may be invalid now */
		lwkt_gettoken(&vmobj_token);

		/*
		 * If we have exhausted the object or depleted our per-pass
		 * page limit then move us to the next object.  Note that
		 * the current object may no longer be on the vm_object_list.
		 */
		if (n <= 0 ||
		    marker->backing_object_offset > vm_swapcache_cleanperobj) {
			vm_swapcache_movemarker(marker, object);
		}

		/*
		 * If we have exhausted our max-launder count, stop for now.
		 */
		count -= n;
		marker->backing_object_offset += n * PAGE_SIZE;
		if (count < 0)
			break;
	}

	/*
	 * If we wound up at the end of the list this will move the
	 * marker back to the beginning.
	 */
	if (object == NULL)
		vm_swapcache_movemarker(marker, NULL);

	lwkt_reltoken(&vmobj_token);
}

/*
 * Move the marker past the current object.  Object can be stale, but we
 * still need it to determine if the marker has to be moved.  If the object
 * is still the 'current object' (object after the marker), we hop-scotch
 * the marker past it.
 */
static void
vm_swapcache_movemarker(vm_object_t marker, vm_object_t object)
{
	if (TAILQ_NEXT(marker, object_list) == object) {
		TAILQ_REMOVE(&vm_object_list, marker, object_list);
		if (object) {
			TAILQ_INSERT_AFTER(&vm_object_list, object,
					   marker, object_list);
		} else {
			TAILQ_INSERT_HEAD(&vm_object_list,
					  marker, object_list);
		}
	}
}