xref: /csrg-svn/sys/vm/swap_pager.c (revision 65682)
1 /*
2  * Copyright (c) 1990 University of Utah.
3  * Copyright (c) 1991, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * the Systems Programming Group of the University of Utah Computer
8  * Science Department.
9  *
10  * %sccs.include.redist.c%
11  *
12  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
13  *
14  *	@(#)swap_pager.c	8.6 (Berkeley) 01/12/94
15  */
16 
17 /*
18  * Quick hack to page to dedicated partition(s).
19  * TODO:
20  *	Add multiprocessor locks
21  *	Deal with async writes in a better fashion
22  */
23 
24 #include <sys/param.h>
25 #include <sys/systm.h>
26 #include <sys/proc.h>
27 #include <sys/buf.h>
28 #include <sys/map.h>
29 #include <sys/vnode.h>
30 #include <sys/malloc.h>
31 
32 #include <miscfs/specfs/specdev.h>
33 
34 #include <vm/vm.h>
35 #include <vm/vm_page.h>
36 #include <vm/vm_pageout.h>
37 #include <vm/swap_pager.h>
38 
/*
 * Compile-time limits.
 */
#define NSWSIZES	16	/* size classes in swtab (a terminator entry follows) */
#define MAXDADDRS	64	/* max # of disk addrs for fixed allocations */
#ifndef NPENDINGIO
#define NPENDINGIO	64	/* max # of pending cleans */
#endif

#ifdef DEBUG
/*
 * Bits tested against swpagerdebug to select diagnostic output.
 */
#define	SDB_FOLLOW	0x001	/* trace entry to the pager routines */
#define SDB_INIT	0x002	/* report swtab construction */
#define SDB_ALLOC	0x004	/* report pager allocation/deallocation */
#define SDB_IO		0x008	/* report IO start and completion */
#define SDB_WRITE	0x010	/* report pageout bookkeeping */
#define SDB_FAIL	0x020	/* report failure paths */
#define SDB_ALLOCBLK	0x040	/* report swap block allocation/lookup */
#define SDB_FULL	0x080	/* report swap block alloc/free */
#define SDB_ANOM	0x100	/* report anomalous conditions */
#define SDB_ANOMPANIC	0x200	/* routines return early once panicstr is set */
#define SDB_CLUSTER	0x400	/* report cluster computations */
#define SDB_PARANOIA	0x800	/* run swap_pager_clean_check() before IO */

/* Default: report anomalies only. */
int	swpagerdebug = SDB_ANOM;
#endif
60 
61 TAILQ_HEAD(swpclean, swpagerclean);
62 
63 struct swpagerclean {
64 	TAILQ_ENTRY(swpagerclean)	spc_list;
65 	int				spc_flags;
66 	struct buf			*spc_bp;
67 	sw_pager_t			spc_swp;
68 	vm_offset_t			spc_kva;
69 	vm_page_t			spc_m;
70 	int				spc_npages;
71 } swcleanlist[NPENDINGIO];
72 typedef struct swpagerclean *swp_clean_t;
73 
74 /* spc_flags values */
75 #define SPC_FREE	0x00
76 #define SPC_BUSY	0x01
77 #define SPC_DONE	0x02
78 #define SPC_ERROR	0x04
79 
80 struct swtab {
81 	vm_size_t st_osize;	/* size of object (bytes) */
82 	int	  st_bsize;	/* vs. size of swap block (DEV_BSIZE units) */
83 #ifdef DEBUG
84 	u_long	  st_inuse;	/* number in this range in use */
85 	u_long	  st_usecnt;	/* total used of this size */
86 #endif
87 } swtab[NSWSIZES+1];
88 
89 #ifdef DEBUG
90 int		swap_pager_poip;	/* pageouts in progress */
91 int		swap_pager_piip;	/* pageins in progress */
92 #endif
93 
94 int		swap_pager_maxcluster;	/* maximum cluster size */
95 int		swap_pager_npendingio;	/* number of pager clean structs */
96 
97 struct swpclean	swap_pager_inuse;	/* list of pending page cleans */
98 struct swpclean	swap_pager_free;	/* list of free pager clean structs */
99 struct pagerlst	swap_pager_list;	/* list of "named" anon regions */
100 
101 static int		swap_pager_finish __P((swp_clean_t));
102 static void 		swap_pager_init __P((void));
103 static vm_pager_t	swap_pager_alloc
104 			    __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t));
105 static void		swap_pager_clean __P((int));
106 #ifdef DEBUG
107 static void		swap_pager_clean_check __P((vm_page_t *, int, int));
108 #endif
109 static void		swap_pager_cluster
110 			    __P((vm_pager_t, vm_offset_t,
111 				 vm_offset_t *, vm_offset_t *));
112 static void		swap_pager_dealloc __P((vm_pager_t));
113 static int		swap_pager_getpage
114 			    __P((vm_pager_t, vm_page_t *, int, boolean_t));
115 static boolean_t	swap_pager_haspage __P((vm_pager_t, vm_offset_t));
116 static int		swap_pager_io __P((sw_pager_t, vm_page_t *, int, int));
117 static void		swap_pager_iodone __P((struct buf *));
118 static int		swap_pager_putpage
119 			    __P((vm_pager_t, vm_page_t *, int, boolean_t));
120 
121 struct pagerops swappagerops = {
122 	swap_pager_init,
123 	swap_pager_alloc,
124 	swap_pager_dealloc,
125 	swap_pager_getpage,
126 	swap_pager_putpage,
127 	swap_pager_haspage,
128 	swap_pager_cluster
129 };
130 
131 static void
132 swap_pager_init()
133 {
134 	register swp_clean_t spc;
135 	register int i, bsize;
136 	extern int dmmin, dmmax;
137 	int maxbsize;
138 
139 #ifdef DEBUG
140 	if (swpagerdebug & (SDB_FOLLOW|SDB_INIT))
141 		printf("swpg_init()\n");
142 #endif
143 	dfltpagerops = &swappagerops;
144 	TAILQ_INIT(&swap_pager_list);
145 
146 	/*
147 	 * Allocate async IO structures.
148 	 *
149 	 * XXX it would be nice if we could do this dynamically based on
150 	 * the value of nswbuf (since we are ultimately limited by that)
151 	 * but neither nswbuf or malloc has been initialized yet.  So the
152 	 * structs are statically allocated above.
153 	 */
154 	swap_pager_npendingio = NPENDINGIO;
155 
156 	/*
157 	 * Initialize clean lists
158 	 */
159 	TAILQ_INIT(&swap_pager_inuse);
160 	TAILQ_INIT(&swap_pager_free);
161 	for (i = 0, spc = swcleanlist; i < swap_pager_npendingio; i++, spc++) {
162 		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
163 		spc->spc_flags = SPC_FREE;
164 	}
165 
166 	/*
167 	 * Calculate the swap allocation constants.
168 	 */
169         if (dmmin == 0) {
170                 dmmin = DMMIN;
171 		if (dmmin < CLBYTES/DEV_BSIZE)
172 			dmmin = CLBYTES/DEV_BSIZE;
173 	}
174         if (dmmax == 0)
175                 dmmax = DMMAX;
176 
177 	/*
178 	 * Fill in our table of object size vs. allocation size
179 	 */
180 	bsize = btodb(PAGE_SIZE);
181 	if (bsize < dmmin)
182 		bsize = dmmin;
183 	maxbsize = btodb(sizeof(sw_bm_t) * NBBY * PAGE_SIZE);
184 	if (maxbsize > dmmax)
185 		maxbsize = dmmax;
186 	for (i = 0; i < NSWSIZES; i++) {
187 		swtab[i].st_osize = (vm_size_t) (MAXDADDRS * dbtob(bsize));
188 		swtab[i].st_bsize = bsize;
189 		if (bsize <= btodb(MAXPHYS))
190 			swap_pager_maxcluster = dbtob(bsize);
191 #ifdef DEBUG
192 		if (swpagerdebug & SDB_INIT)
193 			printf("swpg_init: ix %d, size %x, bsize %x\n",
194 			       i, swtab[i].st_osize, swtab[i].st_bsize);
195 #endif
196 		if (bsize >= maxbsize)
197 			break;
198 		bsize *= 2;
199 	}
200 	swtab[i].st_osize = 0;
201 	swtab[i].st_bsize = bsize;
202 }
203 
204 /*
205  * Allocate a pager structure and associated resources.
206  * Note that if we are called from the pageout daemon (handle == NULL)
207  * we should not wait for memory as it could resulting in deadlock.
208  */
209 static vm_pager_t
210 swap_pager_alloc(handle, size, prot, foff)
211 	caddr_t handle;
212 	register vm_size_t size;
213 	vm_prot_t prot;
214 	vm_offset_t foff;
215 {
216 	register vm_pager_t pager;
217 	register sw_pager_t swp;
218 	struct swtab *swt;
219 	int waitok;
220 
221 #ifdef DEBUG
222 	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOC))
223 		printf("swpg_alloc(%x, %x, %x)\n", handle, size, prot);
224 #endif
225 	/*
226 	 * If this is a "named" anonymous region, look it up and
227 	 * return the appropriate pager if it exists.
228 	 */
229 	if (handle) {
230 		pager = vm_pager_lookup(&swap_pager_list, handle);
231 		if (pager != NULL) {
232 			/*
233 			 * Use vm_object_lookup to gain a reference
234 			 * to the object and also to remove from the
235 			 * object cache.
236 			 */
237 			if (vm_object_lookup(pager) == NULL)
238 				panic("swap_pager_alloc: bad object");
239 			return(pager);
240 		}
241 	}
242 	/*
243 	 * Pager doesn't exist, allocate swap management resources
244 	 * and initialize.
245 	 */
246 	waitok = handle ? M_WAITOK : M_NOWAIT;
247 	pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok);
248 	if (pager == NULL)
249 		return(NULL);
250 	swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok);
251 	if (swp == NULL) {
252 #ifdef DEBUG
253 		if (swpagerdebug & SDB_FAIL)
254 			printf("swpg_alloc: swpager malloc failed\n");
255 #endif
256 		free((caddr_t)pager, M_VMPAGER);
257 		return(NULL);
258 	}
259 	size = round_page(size);
260 	for (swt = swtab; swt->st_osize; swt++)
261 		if (size <= swt->st_osize)
262 			break;
263 #ifdef DEBUG
264 	swt->st_inuse++;
265 	swt->st_usecnt++;
266 #endif
267 	swp->sw_osize = size;
268 	swp->sw_bsize = swt->st_bsize;
269 	swp->sw_nblocks = (btodb(size) + swp->sw_bsize - 1) / swp->sw_bsize;
270 	swp->sw_blocks = (sw_blk_t)
271 		malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks),
272 		       M_VMPGDATA, M_NOWAIT);
273 	if (swp->sw_blocks == NULL) {
274 		free((caddr_t)swp, M_VMPGDATA);
275 		free((caddr_t)pager, M_VMPAGER);
276 #ifdef DEBUG
277 		if (swpagerdebug & SDB_FAIL)
278 			printf("swpg_alloc: sw_blocks malloc failed\n");
279 		swt->st_inuse--;
280 		swt->st_usecnt--;
281 #endif
282 		return(FALSE);
283 	}
284 	bzero((caddr_t)swp->sw_blocks,
285 	      swp->sw_nblocks * sizeof(*swp->sw_blocks));
286 	swp->sw_poip = 0;
287 	if (handle) {
288 		vm_object_t object;
289 
290 		swp->sw_flags = SW_NAMED;
291 		TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list);
292 		/*
293 		 * Consistant with other pagers: return with object
294 		 * referenced.  Can't do this with handle == NULL
295 		 * since it might be the pageout daemon calling.
296 		 */
297 		object = vm_object_allocate(size);
298 		vm_object_enter(object, pager);
299 		vm_object_setpager(object, pager, 0, FALSE);
300 	} else {
301 		swp->sw_flags = 0;
302 		pager->pg_list.tqe_next = NULL;
303 		pager->pg_list.tqe_prev = NULL;
304 	}
305 	pager->pg_handle = handle;
306 	pager->pg_ops = &swappagerops;
307 	pager->pg_type = PG_SWAP;
308 	pager->pg_flags = PG_CLUSTERPUT;
309 	pager->pg_data = swp;
310 
311 #ifdef DEBUG
312 	if (swpagerdebug & SDB_ALLOC)
313 		printf("swpg_alloc: pg_data %x, %x of %x at %x\n",
314 		       swp, swp->sw_nblocks, swp->sw_bsize, swp->sw_blocks);
315 #endif
316 	return(pager);
317 }
318 
319 static void
320 swap_pager_dealloc(pager)
321 	vm_pager_t pager;
322 {
323 	register int i;
324 	register sw_blk_t bp;
325 	register sw_pager_t swp;
326 	struct swtab *swt;
327 	int s;
328 
329 #ifdef DEBUG
330 	/* save panic time state */
331 	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
332 		return;
333 	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOC))
334 		printf("swpg_dealloc(%x)\n", pager);
335 #endif
336 	/*
337 	 * Remove from list right away so lookups will fail if we
338 	 * block for pageout completion.
339 	 */
340 	swp = (sw_pager_t) pager->pg_data;
341 	if (swp->sw_flags & SW_NAMED) {
342 		TAILQ_REMOVE(&swap_pager_list, pager, pg_list);
343 		swp->sw_flags &= ~SW_NAMED;
344 	}
345 #ifdef DEBUG
346 	for (swt = swtab; swt->st_osize; swt++)
347 		if (swp->sw_osize <= swt->st_osize)
348 			break;
349 	swt->st_inuse--;
350 #endif
351 
352 	/*
353 	 * Wait for all pageouts to finish and remove
354 	 * all entries from cleaning list.
355 	 */
356 	s = splbio();
357 	while (swp->sw_poip) {
358 		swp->sw_flags |= SW_WANTED;
359 		(void) tsleep(swp, PVM, "swpgdealloc", 0);
360 	}
361 	splx(s);
362 	swap_pager_clean(B_WRITE);
363 
364 	/*
365 	 * Free left over swap blocks
366 	 */
367 	for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++)
368 		if (bp->swb_block) {
369 #ifdef DEBUG
370 			if (swpagerdebug & (SDB_ALLOCBLK|SDB_FULL))
371 				printf("swpg_dealloc: blk %x\n",
372 				       bp->swb_block);
373 #endif
374 			rmfree(swapmap, swp->sw_bsize, bp->swb_block);
375 		}
376 	/*
377 	 * Free swap management resources
378 	 */
379 	free((caddr_t)swp->sw_blocks, M_VMPGDATA);
380 	free((caddr_t)swp, M_VMPGDATA);
381 	free((caddr_t)pager, M_VMPAGER);
382 }
383 
384 static int
385 swap_pager_getpage(pager, mlist, npages, sync)
386 	vm_pager_t pager;
387 	vm_page_t *mlist;
388 	int npages;
389 	boolean_t sync;
390 {
391 #ifdef DEBUG
392 	if (swpagerdebug & SDB_FOLLOW)
393 		printf("swpg_getpage(%x, %x, %x, %x)\n",
394 		       pager, mlist, npages, sync);
395 #endif
396 	return(swap_pager_io((sw_pager_t)pager->pg_data,
397 			     mlist, npages, B_READ));
398 }
399 
400 static int
401 swap_pager_putpage(pager, mlist, npages, sync)
402 	vm_pager_t pager;
403 	vm_page_t *mlist;
404 	int npages;
405 	boolean_t sync;
406 {
407 	int flags;
408 
409 #ifdef DEBUG
410 	if (swpagerdebug & SDB_FOLLOW)
411 		printf("swpg_putpage(%x, %x, %x, %x)\n",
412 		       pager, mlist, npages, sync);
413 #endif
414 	if (pager == NULL) {
415 		swap_pager_clean(B_WRITE);
416 		return (VM_PAGER_OK);		/* ??? */
417 	}
418 	flags = B_WRITE;
419 	if (!sync)
420 		flags |= B_ASYNC;
421 	return(swap_pager_io((sw_pager_t)pager->pg_data,
422 			     mlist, npages, flags));
423 }
424 
425 static boolean_t
426 swap_pager_haspage(pager, offset)
427 	vm_pager_t pager;
428 	vm_offset_t offset;
429 {
430 	register sw_pager_t swp;
431 	register sw_blk_t swb;
432 	int ix;
433 
434 #ifdef DEBUG
435 	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOCBLK))
436 		printf("swpg_haspage(%x, %x) ", pager, offset);
437 #endif
438 	swp = (sw_pager_t) pager->pg_data;
439 	ix = offset / dbtob(swp->sw_bsize);
440 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
441 #ifdef DEBUG
442 		if (swpagerdebug & (SDB_FAIL|SDB_FOLLOW|SDB_ALLOCBLK))
443 			printf("swpg_haspage: %x bad offset %x, ix %x\n",
444 			       swp->sw_blocks, offset, ix);
445 #endif
446 		return(FALSE);
447 	}
448 	swb = &swp->sw_blocks[ix];
449 	if (swb->swb_block)
450 		ix = atop(offset % dbtob(swp->sw_bsize));
451 #ifdef DEBUG
452 	if (swpagerdebug & SDB_ALLOCBLK)
453 		printf("%x blk %x+%x ", swp->sw_blocks, swb->swb_block, ix);
454 	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOCBLK))
455 		printf("-> %c\n",
456 		       "FT"[swb->swb_block && (swb->swb_mask & (1 << ix))]);
457 #endif
458 	if (swb->swb_block && (swb->swb_mask & (1 << ix)))
459 		return(TRUE);
460 	return(FALSE);
461 }
462 
463 static void
464 swap_pager_cluster(pager, offset, loffset, hoffset)
465 	vm_pager_t	pager;
466 	vm_offset_t	offset;
467 	vm_offset_t	*loffset;
468 	vm_offset_t	*hoffset;
469 {
470 	sw_pager_t swp;
471 	register int bsize;
472 	vm_offset_t loff, hoff;
473 
474 #ifdef DEBUG
475 	if (swpagerdebug & (SDB_FOLLOW|SDB_CLUSTER))
476 		printf("swpg_cluster(%x, %x) ", pager, offset);
477 #endif
478 	swp = (sw_pager_t) pager->pg_data;
479 	bsize = dbtob(swp->sw_bsize);
480 	if (bsize > swap_pager_maxcluster)
481 		bsize = swap_pager_maxcluster;
482 
483 	loff = offset - (offset % bsize);
484 	if (loff >= swp->sw_osize)
485 		panic("swap_pager_cluster: bad offset");
486 
487 	hoff = loff + bsize;
488 	if (hoff > swp->sw_osize)
489 		hoff = swp->sw_osize;
490 
491 	*loffset = loff;
492 	*hoffset = hoff;
493 #ifdef DEBUG
494 	if (swpagerdebug & (SDB_FOLLOW|SDB_CLUSTER))
495 		printf("returns [%x-%x]\n", loff, hoff);
496 #endif
497 }
498 
499 /*
500  * Scaled down version of swap().
501  * Assumes that PAGE_SIZE < MAXPHYS; i.e. only one operation needed.
502  * BOGUS:  lower level IO routines expect a KVA so we have to map our
503  * provided physical page into the KVA to keep them happy.
504  */
505 static int
506 swap_pager_io(swp, mlist, npages, flags)
507 	register sw_pager_t swp;
508 	vm_page_t *mlist;
509 	int npages;
510 	int flags;
511 {
512 	register struct buf *bp;
513 	register sw_blk_t swb;
514 	register int s;
515 	int ix, mask;
516 	boolean_t rv;
517 	vm_offset_t kva, off;
518 	swp_clean_t spc;
519 	vm_page_t m;
520 
521 #ifdef DEBUG
522 	/* save panic time state */
523 	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
524 		return (VM_PAGER_FAIL);		/* XXX: correct return? */
525 	if (swpagerdebug & (SDB_FOLLOW|SDB_IO))
526 		printf("swpg_io(%x, %x, %x, %x)\n", swp, mlist, npages, flags);
527 	if (flags & B_READ) {
528 		if (flags & B_ASYNC)
529 			panic("swap_pager_io: cannot do ASYNC reads");
530 		if (npages != 1)
531 			panic("swap_pager_io: cannot do clustered reads");
532 	}
533 #endif
534 
535 	/*
536 	 * First determine if the page exists in the pager if this is
537 	 * a sync read.  This quickly handles cases where we are
538 	 * following shadow chains looking for the top level object
539 	 * with the page.
540 	 */
541 	m = *mlist;
542 	off = m->offset + m->object->paging_offset;
543 	ix = off / dbtob(swp->sw_bsize);
544 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
545 #ifdef DEBUG
546 		if ((flags & B_READ) == 0 && (swpagerdebug & SDB_ANOM)) {
547 			printf("swap_pager_io: no swap block on write\n");
548 			return(VM_PAGER_BAD);
549 		}
550 #endif
551 		return(VM_PAGER_FAIL);
552 	}
553 	swb = &swp->sw_blocks[ix];
554 	off = off % dbtob(swp->sw_bsize);
555 	if ((flags & B_READ) &&
556 	    (swb->swb_block == 0 || (swb->swb_mask & (1 << atop(off))) == 0))
557 		return(VM_PAGER_FAIL);
558 
559 	/*
560 	 * For reads (pageins) and synchronous writes, we clean up
561 	 * all completed async pageouts.
562 	 */
563 	if ((flags & B_ASYNC) == 0) {
564 		s = splbio();
565 		swap_pager_clean(flags&B_READ);
566 #ifdef DEBUG
567 		if (swpagerdebug & SDB_PARANOIA)
568 			swap_pager_clean_check(mlist, npages, flags&B_READ);
569 #endif
570 		splx(s);
571 	}
572 	/*
573 	 * For async writes (pageouts), we cleanup completed pageouts so
574 	 * that all available resources are freed.  Also tells us if this
575 	 * page is already being cleaned.  If it is, or no resources
576 	 * are available, we try again later.
577 	 */
578 	else {
579 		swap_pager_clean(B_WRITE);
580 #ifdef DEBUG
581 		if (swpagerdebug & SDB_PARANOIA)
582 			swap_pager_clean_check(mlist, npages, B_WRITE);
583 #endif
584 		if (swap_pager_free.tqh_first == NULL) {
585 #ifdef DEBUG
586 			if (swpagerdebug & SDB_FAIL)
587 				printf("%s: no available io headers\n",
588 				       "swap_pager_io");
589 #endif
590 			return(VM_PAGER_AGAIN);
591 		}
592 	}
593 
594 	/*
595 	 * Allocate a swap block if necessary.
596 	 */
597 	if (swb->swb_block == 0) {
598 		swb->swb_block = rmalloc(swapmap, swp->sw_bsize);
599 		if (swb->swb_block == 0) {
600 #ifdef DEBUG
601 			if (swpagerdebug & SDB_FAIL)
602 				printf("swpg_io: rmalloc of %x failed\n",
603 				       swp->sw_bsize);
604 #endif
605 			/*
606 			 * XXX this is technically a resource shortage that
607 			 * should return AGAIN, but the situation isn't likely
608 			 * to be remedied just by delaying a little while and
609 			 * trying again (the pageout daemon's current response
610 			 * to AGAIN) so we just return FAIL.
611 			 */
612 			return(VM_PAGER_FAIL);
613 		}
614 #ifdef DEBUG
615 		if (swpagerdebug & (SDB_FULL|SDB_ALLOCBLK))
616 			printf("swpg_io: %x alloc blk %x at ix %x\n",
617 			       swp->sw_blocks, swb->swb_block, ix);
618 #endif
619 	}
620 
621 	/*
622 	 * Allocate a kernel virtual address and initialize so that PTE
623 	 * is available for lower level IO drivers.
624 	 */
625 	kva = vm_pager_map_pages(mlist, npages, !(flags & B_ASYNC));
626 	if (kva == NULL) {
627 #ifdef DEBUG
628 		if (swpagerdebug & SDB_FAIL)
629 			printf("%s: no KVA space to map pages\n",
630 			       "swap_pager_io");
631 #endif
632 		return(VM_PAGER_AGAIN);
633 	}
634 
635 	/*
636 	 * Get a swap buffer header and initialize it.
637 	 */
638 	s = splbio();
639 	while (bswlist.b_actf == NULL) {
640 #ifdef DEBUG
641 		if (swpagerdebug & SDB_ANOM)
642 			printf("swap_pager_io: wait on swbuf for %x (%d)\n",
643 			       m, flags);
644 #endif
645 		bswlist.b_flags |= B_WANTED;
646 		tsleep((caddr_t)&bswlist, PSWP+1, "swpgiobuf", 0);
647 	}
648 	bp = bswlist.b_actf;
649 	bswlist.b_actf = bp->b_actf;
650 	splx(s);
651 	bp->b_flags = B_BUSY | (flags & B_READ);
652 	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
653 	bp->b_data = (caddr_t)kva;
654 	bp->b_blkno = swb->swb_block + btodb(off);
655 	VHOLD(swapdev_vp);
656 	bp->b_vp = swapdev_vp;
657 	if (swapdev_vp->v_type == VBLK)
658 		bp->b_dev = swapdev_vp->v_rdev;
659 	bp->b_bcount = npages * PAGE_SIZE;
660 
661 	/*
662 	 * For writes we set up additional buffer fields, record a pageout
663 	 * in progress and mark that these swap blocks are now allocated.
664 	 */
665 	if ((bp->b_flags & B_READ) == 0) {
666 		bp->b_dirtyoff = 0;
667 		bp->b_dirtyend = npages * PAGE_SIZE;
668 		swapdev_vp->v_numoutput++;
669 		s = splbio();
670 		swp->sw_poip++;
671 		splx(s);
672 		mask = (~(~0 << npages)) << atop(off);
673 #ifdef DEBUG
674 		swap_pager_poip++;
675 		if (swpagerdebug & SDB_WRITE)
676 			printf("swpg_io: write: bp=%x swp=%x poip=%d\n",
677 			       bp, swp, swp->sw_poip);
678 		if ((swpagerdebug & SDB_ALLOCBLK) &&
679 		    (swb->swb_mask & mask) != mask)
680 			printf("swpg_io: %x write %d pages at %x+%x\n",
681 			       swp->sw_blocks, npages, swb->swb_block,
682 			       atop(off));
683 		if (swpagerdebug & SDB_CLUSTER)
684 			printf("swpg_io: off=%x, npg=%x, mask=%x, bmask=%x\n",
685 			       off, npages, mask, swb->swb_mask);
686 #endif
687 		swb->swb_mask |= mask;
688 	}
689 	/*
690 	 * If this is an async write we set up still more buffer fields
691 	 * and place a "cleaning" entry on the inuse queue.
692 	 */
693 	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC) {
694 #ifdef DEBUG
695 		if (swap_pager_free.tqh_first == NULL)
696 			panic("swpg_io: lost spc");
697 #endif
698 		spc = swap_pager_free.tqh_first;
699 		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
700 #ifdef DEBUG
701 		if (spc->spc_flags != SPC_FREE)
702 			panic("swpg_io: bad free spc");
703 #endif
704 		spc->spc_flags = SPC_BUSY;
705 		spc->spc_bp = bp;
706 		spc->spc_swp = swp;
707 		spc->spc_kva = kva;
708 		/*
709 		 * Record the first page.  This allows swap_pager_finish
710 		 * to efficiently handle the common case of a single page.
711 		 * For clusters, it allows us to locate the object easily
712 		 * and we then reconstruct the rest of the mlist from spc_kva.
713 		 */
714 		spc->spc_m = m;
715 		spc->spc_npages = npages;
716 		bp->b_flags |= B_CALL;
717 		bp->b_iodone = swap_pager_iodone;
718 		s = splbio();
719 		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
720 		splx(s);
721 	}
722 
723 	/*
724 	 * Finally, start the IO operation.
725 	 * If it is async we are all done, otherwise we must wait for
726 	 * completion and cleanup afterwards.
727 	 */
728 #ifdef DEBUG
729 	if (swpagerdebug & SDB_IO)
730 		printf("swpg_io: IO start: bp %x, db %x, va %x, pa %x\n",
731 		       bp, swb->swb_block+btodb(off), kva, VM_PAGE_TO_PHYS(m));
732 #endif
733 	VOP_STRATEGY(bp);
734 	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC) {
735 #ifdef DEBUG
736 		if (swpagerdebug & SDB_IO)
737 			printf("swpg_io:  IO started: bp %x\n", bp);
738 #endif
739 		return(VM_PAGER_PEND);
740 	}
741 	s = splbio();
742 #ifdef DEBUG
743 	if (flags & B_READ)
744 		swap_pager_piip++;
745 	else
746 		swap_pager_poip++;
747 #endif
748 	while ((bp->b_flags & B_DONE) == 0)
749 		(void) tsleep(bp, PVM, "swpgio", 0);
750 	if ((flags & B_READ) == 0)
751 		--swp->sw_poip;
752 #ifdef DEBUG
753 	if (flags & B_READ)
754 		--swap_pager_piip;
755 	else
756 		--swap_pager_poip;
757 #endif
758 	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;
759 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
760 	bp->b_actf = bswlist.b_actf;
761 	bswlist.b_actf = bp;
762 	if (bp->b_vp)
763 		brelvp(bp);
764 	if (bswlist.b_flags & B_WANTED) {
765 		bswlist.b_flags &= ~B_WANTED;
766 		wakeup(&bswlist);
767 	}
768 	if ((flags & B_READ) == 0 && rv == VM_PAGER_OK) {
769 		m->flags |= PG_CLEAN;
770 		pmap_clear_modify(VM_PAGE_TO_PHYS(m));
771 	}
772 	splx(s);
773 #ifdef DEBUG
774 	if (swpagerdebug & SDB_IO)
775 		printf("swpg_io:  IO done: bp %x, rv %d\n", bp, rv);
776 	if ((swpagerdebug & SDB_FAIL) && rv == VM_PAGER_ERROR)
777 		printf("swpg_io: IO error\n");
778 #endif
779 	vm_pager_unmap_pages(kva, npages);
780 	return(rv);
781 }
782 
783 static void
784 swap_pager_clean(rw)
785 	int rw;
786 {
787 	register swp_clean_t spc;
788 	register int s, i;
789 	vm_object_t object;
790 	vm_page_t m;
791 
792 #ifdef DEBUG
793 	/* save panic time state */
794 	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
795 		return;
796 	if (swpagerdebug & SDB_FOLLOW)
797 		printf("swpg_clean(%x)\n", rw);
798 #endif
799 
800 	for (;;) {
801 		/*
802 		 * Look up and removal from inuse list must be done
803 		 * at splbio() to avoid conflicts with swap_pager_iodone.
804 		 */
805 		s = splbio();
806 		for (spc = swap_pager_inuse.tqh_first;
807 		     spc != NULL;
808 		     spc = spc->spc_list.tqe_next) {
809 			/*
810 			 * If the operation is done, remove it from the
811 			 * list and process it.
812 			 *
813 			 * XXX if we can't get the object lock we also
814 			 * leave it on the list and try again later.
815 			 * Is there something better we could do?
816 			 */
817 			if ((spc->spc_flags & SPC_DONE) &&
818 			    vm_object_lock_try(spc->spc_m->object)) {
819 				TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
820 				break;
821 			}
822 		}
823 
824 		/*
825 		 * No operations done, thats all we can do for now.
826 		 */
827 		if (spc == NULL) {
828 			splx(s);
829 			break;
830 		}
831 		splx(s);
832 
833 		/*
834 		 * Found a completed operation so finish it off.
835 		 * Note: no longer at splbio since entry is off the list.
836 		 */
837 		m = spc->spc_m;
838 		object = m->object;
839 
840 		/*
841 		 * Process each page in the cluster.
842 		 * The first page is explicitly kept in the cleaning
843 		 * entry, others must be reconstructed from the KVA.
844 		 */
845 		for (i = 0; i < spc->spc_npages; i++) {
846 			if (i)
847 				m = vm_pager_atop(spc->spc_kva + ptoa(i));
848 			/*
849 			 * If no error mark as clean and inform the pmap
850 			 * system.  If there was an error, mark as dirty
851 			 * so we will try again.
852 			 *
853 			 * XXX could get stuck doing this, should give up
854 			 * after awhile.
855 			 */
856 			if (spc->spc_flags & SPC_ERROR) {
857 				printf("%s: clean of page %x failed\n",
858 				       "swap_pager_clean",
859 				       VM_PAGE_TO_PHYS(m));
860 				m->flags |= PG_LAUNDRY;
861 			} else {
862 				m->flags |= PG_CLEAN;
863 				pmap_clear_modify(VM_PAGE_TO_PHYS(m));
864 			}
865 			m->flags &= ~PG_BUSY;
866 			PAGE_WAKEUP(m);
867 		}
868 
869 		/*
870 		 * Done with the object, decrement the paging count
871 		 * and unlock it.
872 		 */
873 		if (--object->paging_in_progress == 0)
874 			wakeup(object);
875 		vm_object_unlock(object);
876 
877 		/*
878 		 * Free up KVM used and put the entry back on the list.
879 		 */
880 		vm_pager_unmap_pages(spc->spc_kva, spc->spc_npages);
881 		spc->spc_flags = SPC_FREE;
882 		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
883 #ifdef DEBUG
884 		if (swpagerdebug & SDB_WRITE)
885 			printf("swpg_clean: free spc %x\n", spc);
886 #endif
887 	}
888 }
889 
#ifdef DEBUG
/*
 * Debug-only sanity check: panic if any page in mlist is still mapped
 * by an entry on the inuse (cleaning) list.  Each offender is reported
 * first when SDB_ANOM is set.  Does nothing once a panic is underway.
 */
static void
swap_pager_clean_check(mlist, npages, rw)
	vm_page_t *mlist;
	int npages;
	int rw;
{
	register swp_clean_t spc;
	boolean_t found;
	int want, k, s;
	vm_page_t cand;

	if (panicstr)
		return;

	found = FALSE;
	s = splbio();
	spc = swap_pager_inuse.tqh_first;
	while (spc != NULL) {
		/* Compare every page of this cluster with every requested page. */
		for (k = 0; k < spc->spc_npages; k++) {
			cand = vm_pager_atop(spc->spc_kva + ptoa(k));
			for (want = 0; want < npages; want++) {
				if (cand != mlist[want])
					continue;
				if (swpagerdebug & SDB_ANOM)
					printf(
		"swpg_clean_check: %s: page %x on list, flags %x\n",
		rw == B_WRITE ? "write" : "read", mlist[want], spc->spc_flags);
				found = TRUE;
			}
		}
		spc = spc->spc_list.tqe_next;
	}
	splx(s);
	if (found)
		panic("swpg_clean_check");
}
#endif
927 
928 static void
929 swap_pager_iodone(bp)
930 	register struct buf *bp;
931 {
932 	register swp_clean_t spc;
933 	daddr_t blk;
934 	int s;
935 
936 #ifdef DEBUG
937 	/* save panic time state */
938 	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
939 		return;
940 	if (swpagerdebug & SDB_FOLLOW)
941 		printf("swpg_iodone(%x)\n", bp);
942 #endif
943 	s = splbio();
944 	for (spc = swap_pager_inuse.tqh_first;
945 	     spc != NULL;
946 	     spc = spc->spc_list.tqe_next)
947 		if (spc->spc_bp == bp)
948 			break;
949 #ifdef DEBUG
950 	if (spc == NULL)
951 		panic("swap_pager_iodone: bp not found");
952 #endif
953 
954 	spc->spc_flags &= ~SPC_BUSY;
955 	spc->spc_flags |= SPC_DONE;
956 	if (bp->b_flags & B_ERROR)
957 		spc->spc_flags |= SPC_ERROR;
958 	spc->spc_bp = NULL;
959 	blk = bp->b_blkno;
960 
961 #ifdef DEBUG
962 	--swap_pager_poip;
963 	if (swpagerdebug & SDB_WRITE)
964 		printf("swpg_iodone: bp=%x swp=%x flags=%x spc=%x poip=%x\n",
965 		       bp, spc->spc_swp, spc->spc_swp->sw_flags,
966 		       spc, spc->spc_swp->sw_poip);
967 #endif
968 
969 	spc->spc_swp->sw_poip--;
970 	if (spc->spc_swp->sw_flags & SW_WANTED) {
971 		spc->spc_swp->sw_flags &= ~SW_WANTED;
972 		wakeup(spc->spc_swp);
973 	}
974 
975 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
976 	bp->b_actf = bswlist.b_actf;
977 	bswlist.b_actf = bp;
978 	if (bp->b_vp)
979 		brelvp(bp);
980 	if (bswlist.b_flags & B_WANTED) {
981 		bswlist.b_flags &= ~B_WANTED;
982 		wakeup(&bswlist);
983 	}
984 	wakeup(&vm_pages_needed);
985 	splx(s);
986 }
987