xref: /dflybsd-src/sys/vm/vm_swapcache.c (revision 7f357fef10b5ba09ef6123cf559206c73e3d290c)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * Implement the swapcache daemon.  When enabled, swap is assumed to be
39  * configured on a fast storage device such as an SSD.  Swap is assigned
40  * to clean vnode-backed pages in the inactive queue, clustered by object
41  * if possible, and written out.  The swap assignment sticks around even
42  * after the underlying pages have been recycled.
43  *
44  * The daemon manages write bandwidth based on sysctl settings to control
45  * wear on the SSD.
46  *
47  * The vnode strategy code will check for the swap assignments and divert
48  * reads to the swap device when the data is present in the swapcache.
49  *
50  * This operates on both regular files and the block device vnodes used by
51  * filesystems to manage meta-data.
52  */
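/*
 * For illustration only, a typical way the cache is turned on for file
 * data (values are examples; the vm.swapcache.* sysctls defined below and
 * swapcache(8) are the authoritative references):
 *
 *	sysctl vm.swapcache.data_enable=1
 *	sysctl vm.swapcache.read_enable=1
 *	chflags cache /usr/obj	# opt data in when use_chflags is enabled
 */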
53 
54 #include "opt_vm.h"
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/kernel.h>
58 #include <sys/proc.h>
59 #include <sys/kthread.h>
60 #include <sys/resourcevar.h>
61 #include <sys/signalvar.h>
62 #include <sys/vnode.h>
63 #include <sys/vmmeter.h>
64 #include <sys/sysctl.h>
65 #include <sys/eventhandler.h>
66 
67 #include <vm/vm.h>
68 #include <vm/vm_param.h>
69 #include <sys/lock.h>
70 #include <vm/vm_object.h>
71 #include <vm/vm_page.h>
72 #include <vm/vm_map.h>
73 #include <vm/vm_pageout.h>
74 #include <vm/vm_pager.h>
75 #include <vm/swap_pager.h>
76 #include <vm/vm_extern.h>
77 
78 #include <sys/thread2.h>
79 #include <sys/spinlock2.h>
80 #include <vm/vm_page2.h>
81 
82 /* the kernel process "swapcached" */
83 static int vm_swapcached_flush(vm_page_t m, int isblkdev);
84 static int vm_swapcache_test(vm_page_t m);
85 static void vm_swapcache_writing(vm_page_t marker);
86 static void vm_swapcache_cleaning(vm_object_t marker);
87 static void vm_swapcache_movemarker(vm_object_t marker, vm_object_t object);
88 struct thread *swapcached_thread;
89 
90 SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);
91 
92 int vm_swapcache_read_enable;
93 int vm_swapcache_inactive_heuristic;
94 static int vm_swapcache_sleep;
95 static int vm_swapcache_maxscan = 256 * 4;
96 static int vm_swapcache_maxlaunder = 256;
97 static int vm_swapcache_data_enable = 0;
98 static int vm_swapcache_meta_enable = 0;
99 static int vm_swapcache_maxswappct = 75;
100 static int vm_swapcache_hysteresis;
101 int vm_swapcache_use_chflags = 1;	/* require chflags cache */
102 static int64_t vm_swapcache_minburst = 10000000LL;	/* 10MB */
103 static int64_t vm_swapcache_curburst = 4000000000LL;	/* 4G after boot */
104 static int64_t vm_swapcache_maxburst = 2000000000LL;	/* 2G nominal max */
105 static int64_t vm_swapcache_accrate = 100000LL;		/* 100K/s */
106 static int64_t vm_swapcache_write_count;
107 static int64_t vm_swapcache_maxfilesize;
108 static int64_t vm_swapcache_cleanperobj = 16*1024*1024;
109 
110 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
111 	CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
112 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxscan,
113 	CTLFLAG_RW, &vm_swapcache_maxscan, 0, "");
114 
115 SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
116 	CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
117 SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
118 	CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
119 SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
120 	CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
121 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
122 	CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
123 SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
124 	CTLFLAG_RW, &vm_swapcache_hysteresis, 0, "");
125 SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
126 	CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");
127 
128 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
129 	CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
130 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
131 	CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
132 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
133 	CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
134 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
135 	CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
136 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
137 	CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
138 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
139 	CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
140 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, cleanperobj,
141 	CTLFLAG_RW, &vm_swapcache_cleanperobj, 0, "");
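/*
 * A rough feel for the burst defaults above (illustrative arithmetic
 * only): an accrate of 100000 bytes/sec refills curburst by about 10000
 * bytes per 10Hz poll, i.e. roughly 100KB/s of sustained write bandwidth
 * once the large initial curburst (4G, meant for the first load-in after
 * boot) has been consumed.  When curburst recovers it is clamped to
 * maxburst (2G).
 */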
142 
143 #define SWAPMAX(adj)	\
144 	((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)
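/*
 * With the default maxswappct of 75, SWAPMAX(0) evaluates to 75% of
 * vm_swap_max and SWAPMAX(-10) to 65%: writing stops once swapcache use
 * exceeds 75% of configured swap and cleaning continues until use drops
 * back below 65%.
 */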
145 
146 /*
147  * When shutting down the machine we want to stop swapcache operation
148  * immediately so swap is not accessed after devices have been shuttered.
149  */
150 static void
151 shutdown_swapcache(void *arg __unused)
152 {
153 	vm_swapcache_read_enable = 0;
154 	vm_swapcache_data_enable = 0;
155 	vm_swapcache_meta_enable = 0;
156 	wakeup(&vm_swapcache_sleep);	/* shortcut 5-second wait */
157 }
158 
159 /*
160  * vm_swapcached is the high level swapcache daemon.
161  *
162  * No requirements.
163  */
164 static void
165 vm_swapcached_thread(void)
166 {
167 	enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
168 	enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
169 	static struct vm_page page_marker[PQ_L2_SIZE];
170 	static struct vm_object object_marker;
171 	int q;
172 
173 	/*
174 	 * Thread setup
175 	 */
176 	curthread->td_flags |= TDF_SYSTHREAD;
177 	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
178 			      swapcached_thread, SHUTDOWN_PRI_FIRST);
179 	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
180 			      NULL, SHUTDOWN_PRI_SECOND);
181 
182 	/*
183 	 * Initialize our marker for the inactive scan (SWAPC_WRITING)
184 	 */
185 	bzero(&page_marker, sizeof(page_marker));
186 	for (q = 0; q < PQ_L2_SIZE; ++q) {
187 		page_marker[q].flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
188 		page_marker[q].queue = PQ_INACTIVE + q;
189 		page_marker[q].pc = q;
190 		page_marker[q].wire_count = 1;
191 		vm_page_queues_spin_lock(PQ_INACTIVE + q);
192 		TAILQ_INSERT_HEAD(
193 			&vm_page_queues[PQ_INACTIVE + q].pl,
194 			&page_marker[q], pageq);
195 		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
196 	}
197 
198 	vm_swapcache_hysteresis = vmstats.v_inactive_target / 2;
199 	vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
200 
201 	/*
202 	 * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
203 	 */
204 	bzero(&object_marker, sizeof(object_marker));
205 	object_marker.type = OBJT_MARKER;
206 	lwkt_gettoken(&vmobj_token);
207 	TAILQ_INSERT_HEAD(&vm_object_list, &object_marker, object_list);
208 	lwkt_reltoken(&vmobj_token);
209 
210 	for (;;) {
211 		/*
212 		 * Handle shutdown
213 		 */
214 		kproc_suspend_loop();
215 
216 		/*
217 		 * Check every 5 seconds when not enabled or if no swap
218 		 * is present.
219 		 */
220 		if ((vm_swapcache_data_enable == 0 &&
221 		     vm_swapcache_meta_enable == 0) ||
222 		    vm_swap_max == 0) {
223 			tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
224 			continue;
225 		}
226 
227 		/*
228 		 * Polling rate when enabled is approximately 10 Hz.
229 		 */
230 		tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);
231 
232 		/*
233 		 * State hysteresis.  Generate write activity up to 75% of
234 		 * swap (SWAPMAX(0)), then clean out swap assignments down
235 		 * to 65% (SWAPMAX(-10)), then repeat.
236 		 */
237 		if (state == SWAPC_WRITING) {
238 			if (vm_swap_cache_use > SWAPMAX(0))
239 				state = SWAPC_CLEANING;
240 		} else {
241 			if (vm_swap_cache_use < SWAPMAX(-10))
242 				state = SWAPC_WRITING;
243 		}
244 
245 		/*
246 		 * We are allowed to continue accumulating burst value
247 		 * in either state.  Allow the user to set curburst > maxburst
248 		 * for the initial load-in.
249 		 */
250 		if (vm_swapcache_curburst < vm_swapcache_maxburst) {
251 			vm_swapcache_curburst += vm_swapcache_accrate / 10;
252 			if (vm_swapcache_curburst > vm_swapcache_maxburst)
253 				vm_swapcache_curburst = vm_swapcache_maxburst;
254 		}
255 
256 		/*
257 		 * We don't want to nickel-and-dime the scan as that will
258 		 * create unnecessary fragmentation.  The minimum burst
259 		 * is one second's worth of accumulation.
260 		 */
261 		if (state == SWAPC_WRITING) {
262 			if (vm_swapcache_curburst >= vm_swapcache_accrate) {
263 				if (burst == SWAPB_BURSTING) {
264 					for (q = 0; q < PQ_L2_SIZE; ++q) {
265 						vm_swapcache_writing(
266 							&page_marker[q]);
267 					}
268 					if (vm_swapcache_curburst <= 0)
269 						burst = SWAPB_RECOVERING;
270 				} else if (vm_swapcache_curburst >
271 					   vm_swapcache_minburst) {
272 					for (q = 0; q < PQ_L2_SIZE; ++q) {
273 						vm_swapcache_writing(
274 							&page_marker[q]);
275 					}
276 					burst = SWAPB_BURSTING;
277 				}
278 			}
279 		} else {
280 			vm_swapcache_cleaning(&object_marker);
281 		}
282 	}
283 
284 	/*
285 	 * Cleanup (NOT REACHED)
286 	 */
287 	for (q = 0; q < PQ_L2_SIZE; ++q) {
288 		vm_page_queues_spin_lock(PQ_INACTIVE + q);
289 		TAILQ_REMOVE(
290 			&vm_page_queues[PQ_INACTIVE + q].pl,
291 			&page_marker[q], pageq);
292 		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
293 	}
294 
295 	lwkt_gettoken(&vmobj_token);
296 	TAILQ_REMOVE(&vm_object_list, &object_marker, object_list);
297 	lwkt_reltoken(&vmobj_token);
298 }
299 
300 static struct kproc_desc swpc_kp = {
301 	"swapcached",
302 	vm_swapcached_thread,
303 	&swapcached_thread
304 };
305 SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp)
306 
307 static void
308 vm_swapcache_writing(vm_page_t marker)
309 {
310 	vm_object_t object;
311 	struct vnode *vp;
312 	vm_page_t m;
313 	int count;
314 	int scount;
315 	int isblkdev;
316 
317 	/*
318 	 * Deal with an overflow of the heuristic counter, or with the
319 	 * user manually changing the hysteresis.
320 	 *
321 	 * Try to avoid small incremental pageouts by waiting for enough
322 	 * pages to build up in the inactive queue to hopefully get a good
323 	 * burst in.  This heuristic is bumped by the VM system and reset
324 	 * when our scan hits the end of the queue.
325 	 */
326 	if (vm_swapcache_inactive_heuristic < -vm_swapcache_hysteresis)
327 		vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
328 	if (vm_swapcache_inactive_heuristic < 0)
329 		return;
330 
331 	/*
332 	 * Scan the inactive queue from our marker to locate
333 	 * suitable pages to push to the swap cache.
334 	 *
335 	 * We are looking for clean vnode-backed pages.
336 	 */
337 	count = vm_swapcache_maxlaunder;
338 	scount = vm_swapcache_maxscan;
339 
340 	vm_page_queues_spin_lock(marker->queue);
341 	while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
342 	       count > 0 && scount-- > 0) {
343 		KKASSERT(m->queue == marker->queue);
344 
345 		if (vm_swapcache_curburst < 0)
346 			break;
347 		TAILQ_REMOVE(
348 			&vm_page_queues[marker->queue].pl, marker, pageq);
349 		TAILQ_INSERT_AFTER(
350 			&vm_page_queues[marker->queue].pl, m, marker, pageq);
351 
352 		/*
353 		 * Ignore markers and ignore pages that already have a swap
354 		 * assignment.
355 		 */
356 		if (m->flags & (PG_MARKER | PG_SWAPPED))
357 			continue;
358 		if (vm_page_busy_try(m, TRUE))
359 			continue;
360 		vm_page_queues_spin_unlock(marker->queue);
361 
362 		if ((object = m->object) == NULL) {
363 			vm_page_wakeup(m);
364 			vm_page_queues_spin_lock(marker->queue);
365 			continue;
366 		}
367 		vm_object_hold(object);
368 		if (m->object != object) {
369 			vm_object_drop(object);
370 			vm_page_wakeup(m);
371 			vm_page_queues_spin_lock(marker->queue);
372 			continue;
373 		}
374 		if (vm_swapcache_test(m)) {
375 			vm_object_drop(object);
376 			vm_page_wakeup(m);
377 			vm_page_queues_spin_lock(marker->queue);
378 			continue;
379 		}
380 
381 		vp = object->handle;
382 		if (vp == NULL) {
383 			vm_object_drop(object);
384 			vm_page_wakeup(m);
385 			vm_page_queues_spin_lock(marker->queue);
386 			continue;
387 		}
388 
389 		switch(vp->v_type) {
390 		case VREG:
391 			/*
392 			 * PG_NOTMETA generically means 'don't swapcache this',
393 			 * and HAMMER will set this for regular data buffers
394 			 * (and leave it unset for meta-data buffers) as
395 			 * appropriate when double buffering is enabled.
396 			 */
397 			if (m->flags & PG_NOTMETA) {
398 				vm_object_drop(object);
399 				vm_page_wakeup(m);
400 				vm_page_queues_spin_lock(marker->queue);
401 				continue;
402 			}
403 
404 			/*
405 			 * If data_enable is 0 do not try to swapcache data.
406 			 * If use_chflags is set then only swapcache data for
407 			 * VSWAPCACHE marked vnodes, otherwise any vnode.
408 			 */
409 			if (vm_swapcache_data_enable == 0 ||
410 			    ((vp->v_flag & VSWAPCACHE) == 0 &&
411 			     vm_swapcache_use_chflags)) {
412 				vm_object_drop(object);
413 				vm_page_wakeup(m);
414 				vm_page_queues_spin_lock(marker->queue);
415 				continue;
416 			}
417 			if (vm_swapcache_maxfilesize &&
418 			    object->size >
419 			    (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
420 				vm_object_drop(object);
421 				vm_page_wakeup(m);
422 				vm_page_queues_spin_lock(marker->queue);
423 				continue;
424 			}
425 			isblkdev = 0;
426 			break;
427 		case VCHR:
428 			/*
429 			 * PG_NOTMETA generically means 'don't swapcache this',
430 			 * and HAMMER will set this for regular data buffers
431 			 * (and leave it unset for meta-data buffers) as
432 			 * appropriate when double buffering is enabled.
433 			 */
434 			if (m->flags & PG_NOTMETA) {
435 				vm_object_drop(object);
436 				vm_page_wakeup(m);
437 				vm_page_queues_spin_lock(marker->queue);
438 				continue;
439 			}
440 			if (vm_swapcache_meta_enable == 0) {
441 				vm_object_drop(object);
442 				vm_page_wakeup(m);
443 				vm_page_queues_spin_lock(marker->queue);
444 				continue;
445 			}
446 			isblkdev = 1;
447 			break;
448 		default:
449 			vm_object_drop(object);
450 			vm_page_wakeup(m);
451 			vm_page_queues_spin_lock(marker->queue);
452 			continue;
453 		}
454 
455 
456 		/*
457 		 * Assign swap and initiate I/O.
458 		 *
459 		 * (count is reduced by the number of pages flushed, minimum 1)
460 		 */
461 		count -= vm_swapcached_flush(m, isblkdev);
462 
463 		/*
464 		 * Setup for next loop using marker.
465 		 */
466 		vm_object_drop(object);
467 		vm_page_queues_spin_lock(marker->queue);
468 	}
469 
470 	/*
471 	 * The marker could wind up at the end, which is ok.  If we hit the
472 	 * end of the list adjust the heuristic.
473 	 *
474 	 * Earlier inactive pages that were dirty and have become clean
475 	 * are typically moved to the end of PQ_INACTIVE by virtue
476 	 * of vfs_vmio_release() when they become unwired from the
477 	 * buffer cache.
478 	 */
479 	if (m == NULL)
480 		vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
481 	vm_page_queues_spin_unlock(marker->queue);
482 }
483 
484 /*
485  * Flush the specified page using the swap_pager.  The page
486  * must be busied by the caller and its disposition will become
487  * the responsibility of this function.
488  *
489  * Try to collect surrounding pages, including pages which may
490  * have already been assigned swap.  Try to cluster within a
491  * contiguous aligned SWAP_META_PAGES (typ 16 x PAGE_SIZE) block
492  * to match what swap_pager_putpages() can do.
493  *
494  * We also want to try to match against the buffer cache blocksize
495  * but we don't really know what it is here.  Since the buffer cache
496  * wires and unwires pages in groups, the fact that we skip wired pages
497  * should be sufficient.
498  *
499  * Returns a count of pages we might have flushed (minimum 1)
500  */
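/*
 * Clustering example, assuming SWAP_META_PAGES is 16: a page at pindex 37
 * lands in slot 37 & 15 == 5, so the aligned 16-page window begins at
 * pindex 32 and the loops below try to gather pindex 36..32 backwards and
 * 38..47 forwards around the original page before handing the run to
 * swap_pager_putpages().
 */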
501 static
502 int
503 vm_swapcached_flush(vm_page_t m, int isblkdev)
504 {
505 	vm_object_t object;
506 	vm_page_t marray[SWAP_META_PAGES];
507 	vm_pindex_t basei;
508 	int rtvals[SWAP_META_PAGES];
509 	int x;
510 	int i;
511 	int j;
512 	int count;
513 	int error;
514 
515 	vm_page_io_start(m);
516 	vm_page_protect(m, VM_PROT_READ);
517 	object = m->object;
518 	vm_object_hold(object);
519 
520 	/*
521 	 * Try to cluster around (m), keeping in mind that the swap pager
522 	 * can only do SWAP_META_PAGES worth of contiguous write.
523 	 */
524 	x = (int)m->pindex & SWAP_META_MASK;
525 	marray[x] = m;
526 	basei = m->pindex;
527 	vm_page_wakeup(m);
528 
529 	for (i = x - 1; i >= 0; --i) {
530 		m = vm_page_lookup_busy_try(object, basei - x + i,
531 					    TRUE, &error);
532 		if (error || m == NULL)
533 			break;
534 		if (vm_swapcache_test(m)) {
535 			vm_page_wakeup(m);
536 			break;
537 		}
538 		if (isblkdev && (m->flags & PG_NOTMETA)) {
539 			vm_page_wakeup(m);
540 			break;
541 		}
542 		vm_page_io_start(m);
543 		vm_page_protect(m, VM_PROT_READ);
544 		if (m->queue - m->pc == PQ_CACHE) {
545 			vm_page_unqueue_nowakeup(m);
546 			vm_page_deactivate(m);
547 		}
548 		marray[i] = m;
549 		vm_page_wakeup(m);
550 	}
551 	++i;
552 
553 	for (j = x + 1; j < SWAP_META_PAGES; ++j) {
554 		m = vm_page_lookup_busy_try(object, basei - x + j,
555 					    TRUE, &error);
556 		if (error || m == NULL)
557 			break;
558 		if (vm_swapcache_test(m)) {
559 			vm_page_wakeup(m);
560 			break;
561 		}
562 		if (isblkdev && (m->flags & PG_NOTMETA)) {
563 			vm_page_wakeup(m);
564 			break;
565 		}
566 		vm_page_io_start(m);
567 		vm_page_protect(m, VM_PROT_READ);
568 		if (m->queue - m->pc == PQ_CACHE) {
569 			vm_page_unqueue_nowakeup(m);
570 			vm_page_deactivate(m);
571 		}
572 		marray[j] = m;
573 		vm_page_wakeup(m);
574 	}
575 
576 	count = j - i;
577 	vm_object_pip_add(object, count);
578 	swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
579 	vm_swapcache_write_count += count * PAGE_SIZE;
580 	vm_swapcache_curburst -= count * PAGE_SIZE;
581 
582 	while (i < j) {
583 		if (rtvals[i] != VM_PAGER_PEND) {
584 			vm_page_busy_wait(marray[i], FALSE, "swppgfd");
585 			vm_page_io_finish(marray[i]);
586 			vm_page_wakeup(marray[i]);
587 			vm_object_pip_wakeup(object);
588 		}
589 		++i;
590 	}
591 	vm_object_drop(object);
592 	return(count);
593 }
594 
595 /*
596  * Test whether a VM page is suitable for writing to the swapcache.
597  * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
598  *
599  * Returns 0 on success, 1 on failure
600  */
601 static int
602 vm_swapcache_test(vm_page_t m)
603 {
604 	vm_object_t object;
605 
606 	if (m->flags & PG_UNMANAGED)
607 		return(1);
608 	if (m->hold_count || m->wire_count)
609 		return(1);
610 	if (m->valid != VM_PAGE_BITS_ALL)
611 		return(1);
612 	if (m->dirty & m->valid)
613 		return(1);
614 	if ((object = m->object) == NULL)
615 		return(1);
616 	if (object->type != OBJT_VNODE ||
617 	    (object->flags & OBJ_DEAD)) {
618 		return(1);
619 	}
620 	vm_page_test_dirty(m);
621 	if (m->dirty & m->valid)
622 		return(1);
623 	return(0);
624 }
625 
626 /*
627  * Cleaning pass.
628  *
629  * We clean whole objects, up to vm_swapcache_cleanperobj (default 16MB) each.
630  */
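/*
 * The launder budget passed to swap_pager_condfree() below is rounded up
 * to a SWAP_META_PAGES boundary (e.g. a count of 250 becomes 256 when
 * SWAP_META_PAGES is 16), so each call tends to free whole swap radix
 * tree leaves rather than leaving partially populated leaves behind.
 */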
631 static
632 void
633 vm_swapcache_cleaning(vm_object_t marker)
634 {
635 	vm_object_t object;
636 	struct vnode *vp;
637 	int count;
638 	int scount;
639 	int n;
640 
641 	count = vm_swapcache_maxlaunder;
642 	scount = vm_swapcache_maxscan;
643 
644 	/*
645 	 * Look for vnode objects
646 	 */
647 	lwkt_gettoken(&vmobj_token);
648 
649 	while ((object = TAILQ_NEXT(marker, object_list)) != NULL) {
650 		/*
651 		 * We have to skip markers.  We cannot hold/drop marker
652 		 * objects!
653 		 */
654 		if (object->type == OBJT_MARKER) {
655 			vm_swapcache_movemarker(marker, object);
656 			continue;
657 		}
658 
659 		/*
660 		 * Safety, or in case there are millions of VM objects
661 		 * without swapcache backing.
662 		 */
663 		if (--scount <= 0)
664 			break;
665 
666 		/*
667 		 * We must hold the object before potentially yielding.
668 		 */
669 		vm_object_hold(object);
670 		lwkt_yield();
671 
672 		/*
673 		 * Only operate on live VNODE objects that are either
674 		 * VREG or VCHR (VCHR for meta-data).
675 		 */
676 		if ((object->type != OBJT_VNODE) ||
677 		    ((object->flags & OBJ_DEAD) ||
678 		     object->swblock_count == 0) ||
679 		    ((vp = object->handle) == NULL) ||
680 		    (vp->v_type != VREG && vp->v_type != VCHR)) {
681 			vm_object_drop(object);
682 			/* object may be invalid now */
683 			vm_swapcache_movemarker(marker, object);
684 			continue;
685 		}
686 
687 		/*
688 		 * Reset the object pindex stored in the marker if the
689 		 * working object has changed.
690 		 */
691 		if (marker->backing_object != object) {
692 			marker->size = 0;
693 			marker->backing_object_offset = 0;
694 			marker->backing_object = object;
695 		}
696 
697 		/*
698 		 * Look for swblocks starting at our iterator.
699 		 *
700 		 * The swap_pager_condfree() function attempts to free
701 		 * swap space starting at the specified index.  The index
702 		 * will be updated on return.  The function will return
703 		 * a scan factor (NOT the number of blocks freed).
704 		 *
705 		 * If it must cut its scan of the object short due to an
706 		 * excessive number of swblocks, or is able to free the
707 		 * requested number of blocks, it will return n >= count
708 		 * and we break and pick it back up on a future attempt.
709 		 *
710 		 * Scan the object linearly and try to batch large sets of
711 		 * blocks that are likely to clean out entire swap radix
712 		 * tree leaves.
713 		 */
714 		lwkt_token_swap();
715 		lwkt_reltoken(&vmobj_token);
716 
717 		n = swap_pager_condfree(object, &marker->size,
718 				    (count + SWAP_META_MASK) & ~SWAP_META_MASK);
719 
720 		vm_object_drop(object);		/* object may be invalid now */
721 		lwkt_gettoken(&vmobj_token);
722 
723 		/*
724 		 * If we have exhausted the object or used up our per-object
725 		 * cleaning limit then move on to the next object.  Note that
726 		 * the current object may no longer be on the vm_object_list.
727 		 */
728 		if (n <= 0 ||
729 		    marker->backing_object_offset > vm_swapcache_cleanperobj) {
730 			vm_swapcache_movemarker(marker, object);
731 		}
732 
733 		/*
734 		 * If we have exhausted our max-launder budget, stop for now.
735 		 */
736 		count -= n;
737 		marker->backing_object_offset += n * PAGE_SIZE;
738 		if (count < 0)
739 			break;
740 	}
741 
742 	/*
743 	 * If we wound up at the end of the list this will move the
744 	 * marker back to the beginning.
745 	 */
746 	if (object == NULL)
747 		vm_swapcache_movemarker(marker, NULL);
748 
749 	lwkt_reltoken(&vmobj_token);
750 }
751 
752 /*
753  * Move the marker past the current object.  Object can be stale, but we
754  * still need it to determine if the marker has to be moved.  If the object
755  * is still the 'current object' (object after the marker), we hop-scotch
756  * the marker past it.
757  */
758 static void
759 vm_swapcache_movemarker(vm_object_t marker, vm_object_t object)
760 {
761 	if (TAILQ_NEXT(marker, object_list) == object) {
762 		TAILQ_REMOVE(&vm_object_list, marker, object_list);
763 		if (object) {
764 			TAILQ_INSERT_AFTER(&vm_object_list, object,
765 					   marker, object_list);
766 		} else {
767 			TAILQ_INSERT_HEAD(&vm_object_list,
768 					  marker, object_list);
769 		}
770 	}
771 }
772