xref: /onnv-gate/usr/src/uts/common/vm/vm_pvn.c (revision 11888)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
52999Sstans  * Common Development and Distribution License (the "License").
62999Sstans  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*11888SPavel.Filipensky@Sun.COM  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
270Sstevel@tonic-gate /*	  All Rights Reserved  	*/
280Sstevel@tonic-gate 
290Sstevel@tonic-gate /*
300Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
310Sstevel@tonic-gate  * The Regents of the University of California
320Sstevel@tonic-gate  * All Rights Reserved
330Sstevel@tonic-gate  *
340Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
350Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
360Sstevel@tonic-gate  * contributors.
370Sstevel@tonic-gate  */
380Sstevel@tonic-gate 
390Sstevel@tonic-gate /*
400Sstevel@tonic-gate  * VM - paged vnode.
410Sstevel@tonic-gate  *
420Sstevel@tonic-gate  * This file supplies vm support for the vnode operations that deal with pages.
430Sstevel@tonic-gate  */
440Sstevel@tonic-gate #include <sys/types.h>
450Sstevel@tonic-gate #include <sys/t_lock.h>
460Sstevel@tonic-gate #include <sys/param.h>
470Sstevel@tonic-gate #include <sys/sysmacros.h>
480Sstevel@tonic-gate #include <sys/systm.h>
490Sstevel@tonic-gate #include <sys/time.h>
500Sstevel@tonic-gate #include <sys/buf.h>
510Sstevel@tonic-gate #include <sys/vnode.h>
520Sstevel@tonic-gate #include <sys/uio.h>
530Sstevel@tonic-gate #include <sys/vmmeter.h>
540Sstevel@tonic-gate #include <sys/vmsystm.h>
550Sstevel@tonic-gate #include <sys/mman.h>
560Sstevel@tonic-gate #include <sys/vfs.h>
570Sstevel@tonic-gate #include <sys/cred.h>
580Sstevel@tonic-gate #include <sys/user.h>
590Sstevel@tonic-gate #include <sys/kmem.h>
600Sstevel@tonic-gate #include <sys/cmn_err.h>
610Sstevel@tonic-gate #include <sys/debug.h>
620Sstevel@tonic-gate #include <sys/cpuvar.h>
630Sstevel@tonic-gate #include <sys/vtrace.h>
640Sstevel@tonic-gate #include <sys/tnf_probe.h>
650Sstevel@tonic-gate 
660Sstevel@tonic-gate #include <vm/hat.h>
670Sstevel@tonic-gate #include <vm/as.h>
680Sstevel@tonic-gate #include <vm/seg.h>
690Sstevel@tonic-gate #include <vm/rm.h>
700Sstevel@tonic-gate #include <vm/pvn.h>
710Sstevel@tonic-gate #include <vm/page.h>
720Sstevel@tonic-gate #include <vm/seg_map.h>
730Sstevel@tonic-gate #include <vm/seg_kmem.h>
740Sstevel@tonic-gate #include <sys/fs/swapnode.h>
750Sstevel@tonic-gate 
/*
 * Klustering switches.  Both default to 0 (klustering enabled).
 */
int pvn_nofodklust = 0;		/* nonzero: pvn_read_kluster() returns */
				/* only the single requested page */
int pvn_write_noklust = 0;	/* nonzero: pvn_write_kluster() returns */
				/* only the caller's page */

uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
					/* support for vmodsort for testing */

/*
 * NOTE(review): not referenced in the portion of the file reviewed here;
 * presumably a kmem cache for list-marker pages -- confirm against its users.
 */
static struct kmem_cache *marker_cache = NULL;
840Sstevel@tonic-gate 
850Sstevel@tonic-gate /*
860Sstevel@tonic-gate  * Find the largest contiguous block which contains `addr' for file offset
870Sstevel@tonic-gate  * `offset' in it while living within the file system block sizes (`vp_off'
880Sstevel@tonic-gate  * and `vp_len') and the address space limits for which no pages currently
890Sstevel@tonic-gate  * exist and which map to consecutive file offsets.
900Sstevel@tonic-gate  */
910Sstevel@tonic-gate page_t *
920Sstevel@tonic-gate pvn_read_kluster(
930Sstevel@tonic-gate 	struct vnode *vp,
940Sstevel@tonic-gate 	u_offset_t off,
950Sstevel@tonic-gate 	struct seg *seg,
960Sstevel@tonic-gate 	caddr_t addr,
970Sstevel@tonic-gate 	u_offset_t *offp,			/* return values */
980Sstevel@tonic-gate 	size_t *lenp,				/* return values */
990Sstevel@tonic-gate 	u_offset_t vp_off,
1000Sstevel@tonic-gate 	size_t vp_len,
1010Sstevel@tonic-gate 	int isra)
1020Sstevel@tonic-gate {
1030Sstevel@tonic-gate 	ssize_t deltaf, deltab;
1040Sstevel@tonic-gate 	page_t *pp;
1050Sstevel@tonic-gate 	page_t *plist = NULL;
1060Sstevel@tonic-gate 	spgcnt_t pagesavail;
1070Sstevel@tonic-gate 	u_offset_t vp_end;
1080Sstevel@tonic-gate 
1090Sstevel@tonic-gate 	ASSERT(off >= vp_off && off < vp_off + vp_len);
1100Sstevel@tonic-gate 
1110Sstevel@tonic-gate 	/*
1120Sstevel@tonic-gate 	 * We only want to do klustering/read ahead if there
1130Sstevel@tonic-gate 	 * is more than minfree pages currently available.
1140Sstevel@tonic-gate 	 */
1150Sstevel@tonic-gate 	pagesavail = freemem - minfree;
1160Sstevel@tonic-gate 
1170Sstevel@tonic-gate 	if (pagesavail <= 0)
1180Sstevel@tonic-gate 		if (isra)
1190Sstevel@tonic-gate 			return ((page_t *)NULL);    /* ra case - give up */
1200Sstevel@tonic-gate 		else
1210Sstevel@tonic-gate 			pagesavail = 1;		    /* must return a page */
1220Sstevel@tonic-gate 
1230Sstevel@tonic-gate 	/* We calculate in pages instead of bytes due to 32-bit overflows */
1240Sstevel@tonic-gate 	if (pagesavail < (spgcnt_t)btopr(vp_len)) {
1250Sstevel@tonic-gate 		/*
1260Sstevel@tonic-gate 		 * Don't have enough free memory for the
1270Sstevel@tonic-gate 		 * max request, try sizing down vp request.
1280Sstevel@tonic-gate 		 */
1290Sstevel@tonic-gate 		deltab = (ssize_t)(off - vp_off);
1300Sstevel@tonic-gate 		vp_len -= deltab;
1310Sstevel@tonic-gate 		vp_off += deltab;
1320Sstevel@tonic-gate 		if (pagesavail < btopr(vp_len)) {
1330Sstevel@tonic-gate 			/*
1340Sstevel@tonic-gate 			 * Still not enough memory, just settle for
1350Sstevel@tonic-gate 			 * pagesavail which is at least 1.
1360Sstevel@tonic-gate 			 */
1370Sstevel@tonic-gate 			vp_len = ptob(pagesavail);
1380Sstevel@tonic-gate 		}
1390Sstevel@tonic-gate 	}
1400Sstevel@tonic-gate 
1410Sstevel@tonic-gate 	vp_end = vp_off + vp_len;
1420Sstevel@tonic-gate 	ASSERT(off >= vp_off && off < vp_end);
1430Sstevel@tonic-gate 
1440Sstevel@tonic-gate 	if (isra && SEGOP_KLUSTER(seg, addr, 0))
1450Sstevel@tonic-gate 		return ((page_t *)NULL);	/* segment driver says no */
1460Sstevel@tonic-gate 
1470Sstevel@tonic-gate 	if ((plist = page_create_va(vp, off,
1480Sstevel@tonic-gate 	    PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
1490Sstevel@tonic-gate 		return ((page_t *)NULL);
1500Sstevel@tonic-gate 
1510Sstevel@tonic-gate 	if (vp_len <= PAGESIZE || pvn_nofodklust) {
1520Sstevel@tonic-gate 		*offp = off;
1530Sstevel@tonic-gate 		*lenp = MIN(vp_len, PAGESIZE);
1540Sstevel@tonic-gate 	} else {
1550Sstevel@tonic-gate 		/*
1560Sstevel@tonic-gate 		 * Scan back from front by incrementing "deltab" and
1570Sstevel@tonic-gate 		 * comparing "off" with "vp_off + deltab" to avoid
1580Sstevel@tonic-gate 		 * "signed" versus "unsigned" conversion problems.
1590Sstevel@tonic-gate 		 */
1600Sstevel@tonic-gate 		for (deltab = PAGESIZE; off >= vp_off + deltab;
1610Sstevel@tonic-gate 		    deltab += PAGESIZE) {
1620Sstevel@tonic-gate 			/*
1630Sstevel@tonic-gate 			 * Call back to the segment driver to verify that
1640Sstevel@tonic-gate 			 * the klustering/read ahead operation makes sense.
1650Sstevel@tonic-gate 			 */
1660Sstevel@tonic-gate 			if (SEGOP_KLUSTER(seg, addr, -deltab))
1670Sstevel@tonic-gate 				break;		/* page not eligible */
1680Sstevel@tonic-gate 			if ((pp = page_create_va(vp, off - deltab,
1690Sstevel@tonic-gate 			    PAGESIZE, PG_EXCL, seg, addr - deltab))
1700Sstevel@tonic-gate 			    == NULL)
1710Sstevel@tonic-gate 				break;		/* already have the page */
1720Sstevel@tonic-gate 			/*
1730Sstevel@tonic-gate 			 * Add page to front of page list.
1740Sstevel@tonic-gate 			 */
1750Sstevel@tonic-gate 			page_add(&plist, pp);
1760Sstevel@tonic-gate 		}
1770Sstevel@tonic-gate 		deltab -= PAGESIZE;
1780Sstevel@tonic-gate 
1790Sstevel@tonic-gate 		/* scan forward from front */
1800Sstevel@tonic-gate 		for (deltaf = PAGESIZE; off + deltaf < vp_end;
1810Sstevel@tonic-gate 		    deltaf += PAGESIZE) {
1820Sstevel@tonic-gate 			/*
1830Sstevel@tonic-gate 			 * Call back to the segment driver to verify that
1840Sstevel@tonic-gate 			 * the klustering/read ahead operation makes sense.
1850Sstevel@tonic-gate 			 */
1860Sstevel@tonic-gate 			if (SEGOP_KLUSTER(seg, addr, deltaf))
1870Sstevel@tonic-gate 				break;		/* page not file extension */
1880Sstevel@tonic-gate 			if ((pp = page_create_va(vp, off + deltaf,
1890Sstevel@tonic-gate 			    PAGESIZE, PG_EXCL, seg, addr + deltaf))
1900Sstevel@tonic-gate 			    == NULL)
1910Sstevel@tonic-gate 				break;		/* already have page */
1920Sstevel@tonic-gate 
1930Sstevel@tonic-gate 			/*
1940Sstevel@tonic-gate 			 * Add page to end of page list.
1950Sstevel@tonic-gate 			 */
1960Sstevel@tonic-gate 			page_add(&plist, pp);
1970Sstevel@tonic-gate 			plist = plist->p_next;
1980Sstevel@tonic-gate 		}
1990Sstevel@tonic-gate 		*offp = off = off - deltab;
2000Sstevel@tonic-gate 		*lenp = deltab + deltaf;
2010Sstevel@tonic-gate 		ASSERT(off >= vp_off);
2020Sstevel@tonic-gate 
2030Sstevel@tonic-gate 		/*
2040Sstevel@tonic-gate 		 * If we ended up getting more than was actually
2050Sstevel@tonic-gate 		 * requested, retract the returned length to only
2060Sstevel@tonic-gate 		 * reflect what was requested.  This might happen
2070Sstevel@tonic-gate 		 * if we were allowed to kluster pages across a
2080Sstevel@tonic-gate 		 * span of (say) 5 frags, and frag size is less
2090Sstevel@tonic-gate 		 * than PAGESIZE.  We need a whole number of
2100Sstevel@tonic-gate 		 * pages to contain those frags, but the returned
2110Sstevel@tonic-gate 		 * size should only allow the returned range to
2120Sstevel@tonic-gate 		 * extend as far as the end of the frags.
2130Sstevel@tonic-gate 		 */
2140Sstevel@tonic-gate 		if ((vp_off + vp_len) < (off + *lenp)) {
2150Sstevel@tonic-gate 			ASSERT(vp_end > off);
2160Sstevel@tonic-gate 			*lenp = vp_end - off;
2170Sstevel@tonic-gate 		}
2180Sstevel@tonic-gate 	}
2190Sstevel@tonic-gate 	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
220*11888SPavel.Filipensky@Sun.COM 	    "pvn_read_kluster:seg %p addr %x isra %x",
221*11888SPavel.Filipensky@Sun.COM 	    seg, addr, isra);
2220Sstevel@tonic-gate 	return (plist);
2230Sstevel@tonic-gate }
2240Sstevel@tonic-gate 
2250Sstevel@tonic-gate /*
2260Sstevel@tonic-gate  * Handle pages for this vnode on either side of the page "pp"
2270Sstevel@tonic-gate  * which has been locked by the caller.  This routine will also
2280Sstevel@tonic-gate  * do klustering in the range [vp_off, vp_off + vp_len] up
2290Sstevel@tonic-gate  * until a page which is not found.  The offset and length
2300Sstevel@tonic-gate  * of pages included is returned in "*offp" and "*lenp".
2310Sstevel@tonic-gate  *
2320Sstevel@tonic-gate  * Returns a list of dirty locked pages all ready to be
2330Sstevel@tonic-gate  * written back.
2340Sstevel@tonic-gate  */
2350Sstevel@tonic-gate page_t *
2360Sstevel@tonic-gate pvn_write_kluster(
2370Sstevel@tonic-gate 	struct vnode *vp,
2380Sstevel@tonic-gate 	page_t *pp,
2390Sstevel@tonic-gate 	u_offset_t *offp,		/* return values */
2400Sstevel@tonic-gate 	size_t *lenp,			/* return values */
2410Sstevel@tonic-gate 	u_offset_t vp_off,
2420Sstevel@tonic-gate 	size_t vp_len,
2430Sstevel@tonic-gate 	int flags)
2440Sstevel@tonic-gate {
2450Sstevel@tonic-gate 	u_offset_t off;
2460Sstevel@tonic-gate 	page_t *dirty;
2470Sstevel@tonic-gate 	size_t deltab, deltaf;
2480Sstevel@tonic-gate 	se_t se;
2490Sstevel@tonic-gate 	u_offset_t vp_end;
2500Sstevel@tonic-gate 
2510Sstevel@tonic-gate 	off = pp->p_offset;
2520Sstevel@tonic-gate 
2530Sstevel@tonic-gate 	/*
2540Sstevel@tonic-gate 	 * Kustering should not be done if we are invalidating
2550Sstevel@tonic-gate 	 * pages since we could destroy pages that belong to
2560Sstevel@tonic-gate 	 * some other process if this is a swap vnode.
2570Sstevel@tonic-gate 	 */
2580Sstevel@tonic-gate 	if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
2590Sstevel@tonic-gate 		*offp = off;
2600Sstevel@tonic-gate 		*lenp = PAGESIZE;
2610Sstevel@tonic-gate 		return (pp);
2620Sstevel@tonic-gate 	}
2630Sstevel@tonic-gate 
2640Sstevel@tonic-gate 	if (flags & (B_FREE | B_INVAL))
2650Sstevel@tonic-gate 		se = SE_EXCL;
2660Sstevel@tonic-gate 	else
2670Sstevel@tonic-gate 		se = SE_SHARED;
2680Sstevel@tonic-gate 
2690Sstevel@tonic-gate 	dirty = pp;
2700Sstevel@tonic-gate 	/*
2710Sstevel@tonic-gate 	 * Scan backwards looking for pages to kluster by incrementing
2720Sstevel@tonic-gate 	 * "deltab" and comparing "off" with "vp_off + deltab" to
2730Sstevel@tonic-gate 	 * avoid "signed" versus "unsigned" conversion problems.
2740Sstevel@tonic-gate 	 */
2750Sstevel@tonic-gate 	for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
2760Sstevel@tonic-gate 		pp = page_lookup_nowait(vp, off - deltab, se);
2770Sstevel@tonic-gate 		if (pp == NULL)
2780Sstevel@tonic-gate 			break;		/* page not found */
2790Sstevel@tonic-gate 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
2800Sstevel@tonic-gate 			break;
2810Sstevel@tonic-gate 		page_add(&dirty, pp);
2820Sstevel@tonic-gate 	}
2830Sstevel@tonic-gate 	deltab -= PAGESIZE;
2840Sstevel@tonic-gate 
2850Sstevel@tonic-gate 	vp_end = vp_off + vp_len;
2860Sstevel@tonic-gate 	/* now scan forwards looking for pages to kluster */
2870Sstevel@tonic-gate 	for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
2880Sstevel@tonic-gate 		pp = page_lookup_nowait(vp, off + deltaf, se);
2890Sstevel@tonic-gate 		if (pp == NULL)
2900Sstevel@tonic-gate 			break;		/* page not found */
2910Sstevel@tonic-gate 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
2920Sstevel@tonic-gate 			break;
2930Sstevel@tonic-gate 		page_add(&dirty, pp);
2940Sstevel@tonic-gate 		dirty = dirty->p_next;
2950Sstevel@tonic-gate 	}
2960Sstevel@tonic-gate 
2970Sstevel@tonic-gate 	*offp = off - deltab;
2980Sstevel@tonic-gate 	*lenp = deltab + deltaf;
2990Sstevel@tonic-gate 	return (dirty);
3000Sstevel@tonic-gate }
3010Sstevel@tonic-gate 
3020Sstevel@tonic-gate /*
3030Sstevel@tonic-gate  * Generic entry point used to release the "shared/exclusive" lock
3040Sstevel@tonic-gate  * and the "p_iolock" on pages after i/o is complete.
3050Sstevel@tonic-gate  */
3060Sstevel@tonic-gate void
3070Sstevel@tonic-gate pvn_io_done(page_t *plist)
3080Sstevel@tonic-gate {
3090Sstevel@tonic-gate 	page_t *pp;
3100Sstevel@tonic-gate 
3110Sstevel@tonic-gate 	while (plist != NULL) {
3120Sstevel@tonic-gate 		pp = plist;
3130Sstevel@tonic-gate 		page_sub(&plist, pp);
3140Sstevel@tonic-gate 		page_io_unlock(pp);
3150Sstevel@tonic-gate 		page_unlock(pp);
3160Sstevel@tonic-gate 	}
3170Sstevel@tonic-gate }
3180Sstevel@tonic-gate 
3190Sstevel@tonic-gate /*
3200Sstevel@tonic-gate  * Entry point to be used by file system getpage subr's and
3210Sstevel@tonic-gate  * other such routines which either want to unlock pages (B_ASYNC
3220Sstevel@tonic-gate  * request) or destroy a list of pages if an error occurred.
3230Sstevel@tonic-gate  */
3240Sstevel@tonic-gate void
3250Sstevel@tonic-gate pvn_read_done(page_t *plist, int flags)
3260Sstevel@tonic-gate {
3270Sstevel@tonic-gate 	page_t *pp;
3280Sstevel@tonic-gate 
3290Sstevel@tonic-gate 	while (plist != NULL) {
3300Sstevel@tonic-gate 		pp = plist;
3310Sstevel@tonic-gate 		page_sub(&plist, pp);
3320Sstevel@tonic-gate 		page_io_unlock(pp);
3330Sstevel@tonic-gate 		if (flags & B_ERROR) {
3340Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
3350Sstevel@tonic-gate 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
3360Sstevel@tonic-gate 		} else {
3370Sstevel@tonic-gate 			(void) page_release(pp, 0);
3380Sstevel@tonic-gate 		}
3390Sstevel@tonic-gate 	}
3400Sstevel@tonic-gate }
3410Sstevel@tonic-gate 
/*
 * Automagic pageout.
 * When memory gets tight, start freeing pages popping out of the
 * write queue.
 *
 * pvn_write_done() adds B_FREE to its flags when write_free is nonzero,
 * freemem is below lotsfree + pages_before_pager, and the write did not
 * fail (B_ERROR is clear).
 */
int	write_free = 1;			/* nonzero enables the behavior above */
pgcnt_t	pages_before_pager = 200;	/* LMXXX */
3490Sstevel@tonic-gate 
/*
 * Routine to be called when page-out's complete.
 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
 * after waiting for i/o to complete (biowait) to free the list of
 * pages associated with the buffer.  These pages must be locked
 * before i/o is initiated.
 *
 * If a write error occurs, the pages are marked as modified
 * so the write will be re-tried later.
 *
 * Arguments:
 *	plist	list of pages the write was issued for; each page is
 *		expected to be locked and to hold its i/o lock (asserted)
 *	flags	B_* flags of the request; B_READ must not be set.
 *		B_ERROR, B_INVAL, B_FORCE, B_FREE and B_DONTNEED are
 *		examined here.
 *
 * Every page is removed from the list and has its i/o lock dropped; it is
 * then either disposed of or unlocked, depending on the flags and on the
 * state of the page.  Per-CPU vm statistics are updated at the end.
 */

void
pvn_write_done(page_t *plist, int flags)
{
	/* Local tallies, folded into the per-CPU vm statistics at the end. */
	int dfree = 0;		/* pages freed */
	int pgrec = 0;		/* pages found referenced/modified again */
	int pgout = 0;		/* set to 1 if any page was paged out */
	int pgpgout = 0;	/* pages paged out */
	int anonpgout = 0;
	int anonfree = 0;
	int fspgout = 0;
	int fsfree = 0;
	int execpgout = 0;
	int execfree = 0;
	page_t *pp;
	struct cpu *cpup;
	struct vnode *vp = NULL;	/* for probe */
	uint_t ppattr;
	kmutex_t *vphm = NULL;

	ASSERT((flags & B_READ) == 0);

	/*
	 * If we are about to start paging anyway, start freeing pages.
	 */
	if (write_free && freemem < lotsfree + pages_before_pager &&
	    (flags & B_ERROR) == 0) {
		flags |= B_FREE;
	}

	/*
	 * Handle each page involved in the i/o operation.
	 */
	while (plist != NULL) {
		pp = plist;
		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
		page_sub(&plist, pp);

		/*
		 * Kernel probe support.  Note that "vp" is taken from the
		 * first page only and is also the vnode used by the
		 * VMODSORT handling below for every page on the list.
		 */
		if (vp == NULL)
			vp = pp->p_vnode;

		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
			/*
			 * Move page to the top of the v_page list.
			 * Skip pages modified during IO.
			 */
			vphm = page_vnode_mutex(vp);
			mutex_enter(vphm);
			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
				page_vpsub(&vp->v_pages, pp);
				page_vpadd(&vp->v_pages, pp);
			}
			mutex_exit(vphm);
		}

		if (flags & B_ERROR) {
			/*
			 * Write operation failed.  We don't want
			 * to destroy (or free) the page unless B_FORCE
			 * is set. We set the mod bit again and release
			 * all locks on the page so that it will get written
			 * back again later when things are hopefully
			 * better again.
			 * If B_INVAL and B_FORCE is set we really have
			 * to destroy the page.
			 */
			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
				page_io_unlock(pp);
				/*LINTED: constant in conditional context*/
				VN_DISPOSE(pp, B_INVAL, 0, kcred);
			} else {
				hat_setmod_only(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
		} else if (flags & B_INVAL) {
			/*
			 * Successful write with B_INVAL: dispose of the page.
			 *
			 * XXX - Failed writes with B_INVAL set are
			 * not handled appropriately.
			 */
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
			/*
			 * Update statistics for pages being paged out
			 */
			if (pp->p_vnode) {
				if (IS_SWAPFSVP(pp->p_vnode)) {
					anonpgout++;
				} else {
					if (pp->p_vnode->v_flag & VVMEXEC) {
						execpgout++;
					} else {
						fspgout++;
					}
				}
			}
			page_io_unlock(pp);
			pgout = 1;
			pgpgout++;
			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
			    "page_ws_out:pp %p", pp);

			/*
			 * The page_struct_lock need not be acquired to
			 * examine "p_lckcnt" and "p_cowcnt" since we'll
			 * have an "exclusive" lock if the upgrade succeeds.
			 */
			if (page_tryupgrade(pp) &&
			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
				/*
				 * Check if someone has reclaimed the
				 * page.  If ref and mod are not set, no
				 * one is using it so we can free it.
				 * The rest of the system is careful
				 * to use the NOSYNC flag to unload
				 * translations set up for i/o w/o
				 * affecting ref and mod bits.
				 *
				 * Obtain a copy of the real hardware
				 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
				 * to avoid having to flush the cache.
				 */
				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
				    HAT_SYNC_STOPON_MOD);
			ck_refmod:
				if (!(ppattr & (P_REF | P_MOD))) {
					if (hat_page_is_mapped(pp)) {
						/*
						 * Doesn't look like the page
						 * was modified so now we
						 * really have to unload the
						 * translations.  Meanwhile
						 * another CPU could've
						 * modified it so we have to
						 * check again.  We don't loop
						 * forever here because now
						 * the translations are gone
						 * and no one can get a new one
						 * since we have the "exclusive"
						 * lock on the page.
						 */
						(void) hat_pageunload(pp,
						    HAT_FORCE_PGUNLOAD);
						ppattr = hat_page_getattr(pp,
						    P_REF | P_MOD);
						goto ck_refmod;
					}
					/*
					 * Update statistics for pages being
					 * freed
					 */
					if (pp->p_vnode) {
						if (IS_SWAPFSVP(pp->p_vnode)) {
							anonfree++;
						} else {
							if (pp->p_vnode->v_flag
							    & VVMEXEC) {
								execfree++;
							} else {
								fsfree++;
							}
						}
					}
					/*LINTED: constant in conditional ctx*/
					VN_DISPOSE(pp, B_FREE,
					    (flags & B_DONTNEED), kcred);
					dfree++;
				} else {
					/*
					 * Ref or mod is set: the page is in
					 * use again, so keep it and count it
					 * as reclaimed.
					 * NOTE(review): the trace point below
					 * is named "..._FREE" although this
					 * branch does not free the page --
					 * confirm this is intended.
					 */
					page_unlock(pp);
					pgrec++;
					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
					    "page_ws_free:pp %p", pp);
				}
			} else {
				/*
				 * Page is either `locked' in memory
				 * or was reclaimed and now has a
				 * "shared" lock, so release it.
				 */
				page_unlock(pp);
			}
		} else {
			/*
			 * Neither B_FREE nor B_INVAL nor B_ERROR.
			 * Just release locks.
			 */
			page_io_unlock(pp);
			page_unlock(pp);
		}
	}

	/* Fold the local tallies into the current CPU's vm statistics. */
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get cpup now that CPU cannot change */
	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
	CPU_STATS_EXIT_K();

	/* Kernel probe */
	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
	    tnf_opaque,	vnode,			vp,
	    tnf_ulong,	pages_pageout,		pgpgout,
	    tnf_ulong,	pages_freed,		dfree,
	    tnf_ulong,	pages_reclaimed,	pgrec);
}
5750Sstevel@tonic-gate 
5760Sstevel@tonic-gate /*
5770Sstevel@tonic-gate  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
5780Sstevel@tonic-gate  * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
5790Sstevel@tonic-gate  * operation and is only to be considered if it doesn't involve any
5800Sstevel@tonic-gate  * waiting here.  B_TRUNC indicates that the file is being truncated
5810Sstevel@tonic-gate  * and so no i/o needs to be done. B_FORCE indicates that the page
 * must be destroyed so don't try writing it out.
5830Sstevel@tonic-gate  *
5840Sstevel@tonic-gate  * The caller must ensure that the page is locked.  Returns 1, if
5850Sstevel@tonic-gate  * the page should be written back (the "iolock" is held in this
5860Sstevel@tonic-gate  * case), or 0 if the page has been dealt with or has been
5870Sstevel@tonic-gate  * unlocked.
5880Sstevel@tonic-gate  */
5890Sstevel@tonic-gate int
5900Sstevel@tonic-gate pvn_getdirty(page_t *pp, int flags)
5910Sstevel@tonic-gate {
5920Sstevel@tonic-gate 	ASSERT((flags & (B_INVAL | B_FREE)) ?
5930Sstevel@tonic-gate 	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
5940Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp) == 0);
5950Sstevel@tonic-gate 
5960Sstevel@tonic-gate 	/*
5970Sstevel@tonic-gate 	 * If trying to invalidate or free a logically `locked' page,
5980Sstevel@tonic-gate 	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
5990Sstevel@tonic-gate 	 * p_cowcnt as the page is exclusively locked.
6000Sstevel@tonic-gate 	 */
6010Sstevel@tonic-gate 	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
6020Sstevel@tonic-gate 	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
6030Sstevel@tonic-gate 		page_unlock(pp);
6040Sstevel@tonic-gate 		return (0);
6050Sstevel@tonic-gate 	}
6060Sstevel@tonic-gate 
6070Sstevel@tonic-gate 	/*
6080Sstevel@tonic-gate 	 * Now acquire the i/o lock so we can add it to the dirty
6090Sstevel@tonic-gate 	 * list (if necessary).  We avoid blocking on the i/o lock
6100Sstevel@tonic-gate 	 * in the following cases:
6110Sstevel@tonic-gate 	 *
6120Sstevel@tonic-gate 	 *	If B_DELWRI is set, which implies that this request is
6130Sstevel@tonic-gate 	 *	due to a klustering operartion.
6140Sstevel@tonic-gate 	 *
6150Sstevel@tonic-gate 	 *	If this is an async (B_ASYNC) operation and we are not doing
6160Sstevel@tonic-gate 	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
6170Sstevel@tonic-gate 	 *	that the the page is written out].
6180Sstevel@tonic-gate 	 */
6190Sstevel@tonic-gate 	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
6200Sstevel@tonic-gate 		if (!page_io_trylock(pp)) {
6210Sstevel@tonic-gate 			page_unlock(pp);
6220Sstevel@tonic-gate 			return (0);
6230Sstevel@tonic-gate 		}
6240Sstevel@tonic-gate 	} else {
6250Sstevel@tonic-gate 		page_io_lock(pp);
6260Sstevel@tonic-gate 	}
6270Sstevel@tonic-gate 
6280Sstevel@tonic-gate 	/*
6290Sstevel@tonic-gate 	 * If we want to free or invalidate the page then
6300Sstevel@tonic-gate 	 * we need to unload it so that anyone who wants
6310Sstevel@tonic-gate 	 * it will have to take a minor fault to get it.
6320Sstevel@tonic-gate 	 * Otherwise, we're just writing the page back so we
6330Sstevel@tonic-gate 	 * need to sync up the hardwre and software mod bit to
6340Sstevel@tonic-gate 	 * detect any future modifications.  We clear the
6350Sstevel@tonic-gate 	 * software mod bit when we put the page on the dirty
6360Sstevel@tonic-gate 	 * list.
6370Sstevel@tonic-gate 	 */
6380Sstevel@tonic-gate 	if (flags & (B_INVAL | B_FREE)) {
6390Sstevel@tonic-gate 		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6400Sstevel@tonic-gate 	} else {
6410Sstevel@tonic-gate 		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
6420Sstevel@tonic-gate 	}
6430Sstevel@tonic-gate 
6440Sstevel@tonic-gate 	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
6450Sstevel@tonic-gate 		/*
6460Sstevel@tonic-gate 		 * Don't need to add it to the
6470Sstevel@tonic-gate 		 * list after all.
6480Sstevel@tonic-gate 		 */
6490Sstevel@tonic-gate 		page_io_unlock(pp);
6500Sstevel@tonic-gate 		if (flags & B_INVAL) {
6510Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
6520Sstevel@tonic-gate 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
6530Sstevel@tonic-gate 		} else if (flags & B_FREE) {
6540Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
6550Sstevel@tonic-gate 			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
6560Sstevel@tonic-gate 		} else {
6570Sstevel@tonic-gate 			/*
6580Sstevel@tonic-gate 			 * This is advisory path for the callers
6590Sstevel@tonic-gate 			 * of VOP_PUTPAGE() who prefer freeing the
6600Sstevel@tonic-gate 			 * page _only_ if no one else is accessing it.
6610Sstevel@tonic-gate 			 * E.g. segmap_release()
6620Sstevel@tonic-gate 			 *
6630Sstevel@tonic-gate 			 * The above hat_ismod() check is useless because:
6640Sstevel@tonic-gate 			 * (1) we may not be holding SE_EXCL lock;
6650Sstevel@tonic-gate 			 * (2) we've not unloaded _all_ translations
6660Sstevel@tonic-gate 			 *
6670Sstevel@tonic-gate 			 * Let page_release() do the heavy-lifting.
6680Sstevel@tonic-gate 			 */
6690Sstevel@tonic-gate 			(void) page_release(pp, 1);
6700Sstevel@tonic-gate 		}
6710Sstevel@tonic-gate 		return (0);
6720Sstevel@tonic-gate 	}
6730Sstevel@tonic-gate 
6740Sstevel@tonic-gate 	/*
6750Sstevel@tonic-gate 	 * Page is dirty, get it ready for the write back
6760Sstevel@tonic-gate 	 * and add page to the dirty list.
6770Sstevel@tonic-gate 	 */
6780Sstevel@tonic-gate 	hat_clrrefmod(pp);
6790Sstevel@tonic-gate 
6800Sstevel@tonic-gate 	/*
6810Sstevel@tonic-gate 	 * If we're going to free the page when we're done
6820Sstevel@tonic-gate 	 * then we can let others try to use it starting now.
6830Sstevel@tonic-gate 	 * We'll detect the fact that they used it when the
6840Sstevel@tonic-gate 	 * i/o is done and avoid freeing the page.
6850Sstevel@tonic-gate 	 */
6860Sstevel@tonic-gate 	if (flags & B_FREE)
6870Sstevel@tonic-gate 		page_downgrade(pp);
6880Sstevel@tonic-gate 
6890Sstevel@tonic-gate 
6900Sstevel@tonic-gate 	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
6910Sstevel@tonic-gate 
6920Sstevel@tonic-gate 	return (1);
6930Sstevel@tonic-gate }
6940Sstevel@tonic-gate 
6950Sstevel@tonic-gate 
6960Sstevel@tonic-gate /*ARGSUSED*/
6970Sstevel@tonic-gate static int
6980Sstevel@tonic-gate marker_constructor(void *buf, void *cdrarg, int kmflags)
6990Sstevel@tonic-gate {
7000Sstevel@tonic-gate 	page_t *mark = buf;
7010Sstevel@tonic-gate 	bzero(mark, sizeof (page_t));
702*11888SPavel.Filipensky@Sun.COM 	mark->p_hash = PVN_VPLIST_HASH_TAG;
7030Sstevel@tonic-gate 	return (0);
7040Sstevel@tonic-gate }
7050Sstevel@tonic-gate 
7060Sstevel@tonic-gate void
7070Sstevel@tonic-gate pvn_init()
7080Sstevel@tonic-gate {
7090Sstevel@tonic-gate 	if (pvn_vmodsort_disable == 0)
7100Sstevel@tonic-gate 		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
7110Sstevel@tonic-gate 	marker_cache = kmem_cache_create("marker_cache",
7120Sstevel@tonic-gate 	    sizeof (page_t), 0, marker_constructor,
7130Sstevel@tonic-gate 	    NULL, NULL, NULL, NULL, 0);
7140Sstevel@tonic-gate }
7150Sstevel@tonic-gate 
7160Sstevel@tonic-gate 
7170Sstevel@tonic-gate /*
7180Sstevel@tonic-gate  * Process a vnode's page list for all pages whose offset is >= off.
7190Sstevel@tonic-gate  * Pages are to either be free'd, invalidated, or written back to disk.
7200Sstevel@tonic-gate  *
7210Sstevel@tonic-gate  * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
7220Sstevel@tonic-gate  * is specified, otherwise they are "shared" locked.
7230Sstevel@tonic-gate  *
7240Sstevel@tonic-gate  * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
7250Sstevel@tonic-gate  *
7260Sstevel@tonic-gate  * Special marker page_t's are inserted in the list in order
7270Sstevel@tonic-gate  * to keep track of where we are in the list when locks are dropped.
7280Sstevel@tonic-gate  *
7290Sstevel@tonic-gate  * Note the list is circular and insertions can happen only at the
7300Sstevel@tonic-gate  * head and tail of the list. The algorithm ensures visiting all pages
7310Sstevel@tonic-gate  * on the list in the following way:
7320Sstevel@tonic-gate  *
7330Sstevel@tonic-gate  *    Drop two marker pages at the end of the list.
7340Sstevel@tonic-gate  *
7350Sstevel@tonic-gate  *    Move one marker page backwards towards the start of the list until
7360Sstevel@tonic-gate  *    it is at the list head, processing the pages passed along the way.
7370Sstevel@tonic-gate  *
7380Sstevel@tonic-gate  *    Due to race conditions when the vphm mutex is dropped, additional pages
7390Sstevel@tonic-gate  *    can be added to either end of the list, so we'll continue to move
7400Sstevel@tonic-gate  *    the marker and process pages until it is up against the end marker.
7410Sstevel@tonic-gate  *
7420Sstevel@tonic-gate  * There is one special exit condition. If we are processing a VMODSORT
7430Sstevel@tonic-gate  * vnode and only writing back modified pages, we can stop as soon as
7440Sstevel@tonic-gate  * we run into an unmodified page.  This makes fsync(3) operations fast.
7450Sstevel@tonic-gate  */
/*
 * Arguments:
 *	vp		vnode whose v_pages list is walked (must not be VCHR).
 *	off		pages with p_offset below this value are skipped.
 *	putapage	called, with the page list mutex dropped, for each
 *			page that pvn_getdirty() reports as dirty; its
 *			offset and length out-pointers are passed as NULL.
 *	flags		B_* flags listed above; handed on to pvn_getdirty()
 *			and putapage unchanged.
 *	cred		credentials handed on to putapage.
 *
 * Returns 0 if the vnode has no pages, EAGAIN if B_ASYNC is set and
 * another thread already holds VVMLOCK on the vnode, and otherwise the
 * first non-zero value returned by putapage (0 if every call succeeded).
 */
7460Sstevel@tonic-gate int
7470Sstevel@tonic-gate pvn_vplist_dirty(
7480Sstevel@tonic-gate 	vnode_t		*vp,
7490Sstevel@tonic-gate 	u_offset_t	off,
7500Sstevel@tonic-gate 	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
7510Sstevel@tonic-gate 			size_t *, int, cred_t *),
7520Sstevel@tonic-gate 	int		flags,
7530Sstevel@tonic-gate 	cred_t		*cred)
7540Sstevel@tonic-gate {
7550Sstevel@tonic-gate 	page_t		*pp;
7560Sstevel@tonic-gate 	page_t		*mark;		/* marker page that moves toward head */
7570Sstevel@tonic-gate 	page_t		*end;		/* marker page at end of list */
7580Sstevel@tonic-gate 	int		err = 0;
7590Sstevel@tonic-gate 	int		error;
7600Sstevel@tonic-gate 	kmutex_t	*vphm;
7610Sstevel@tonic-gate 	se_t		se;
7620Sstevel@tonic-gate 	page_t		**where_to_move;
7630Sstevel@tonic-gate 
7640Sstevel@tonic-gate 	ASSERT(vp->v_type != VCHR);
7650Sstevel@tonic-gate 
7660Sstevel@tonic-gate 	if (vp->v_pages == NULL)
7670Sstevel@tonic-gate 		return (0);
7680Sstevel@tonic-gate 
7690Sstevel@tonic-gate 
7700Sstevel@tonic-gate 	/*
7710Sstevel@tonic-gate 	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
7720Sstevel@tonic-gate 	 *
7730Sstevel@tonic-gate 	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
7740Sstevel@tonic-gate 	 * from getting blocked while flushing pages to a dead NFS server.
7750Sstevel@tonic-gate 	 */
7760Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
7770Sstevel@tonic-gate 	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
7780Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
7790Sstevel@tonic-gate 		return (EAGAIN);
7800Sstevel@tonic-gate 	}
7810Sstevel@tonic-gate 
7820Sstevel@tonic-gate 	while (vp->v_flag & VVMLOCK)
7830Sstevel@tonic-gate 		cv_wait(&vp->v_cv, &vp->v_lock);
7840Sstevel@tonic-gate 
7850Sstevel@tonic-gate 	if (vp->v_pages == NULL) {
7860Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
7870Sstevel@tonic-gate 		return (0);
7880Sstevel@tonic-gate 	}
7890Sstevel@tonic-gate 
7900Sstevel@tonic-gate 	vp->v_flag |= VVMLOCK;
7910Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
7920Sstevel@tonic-gate 
7930Sstevel@tonic-gate 
7940Sstevel@tonic-gate 	/*
7950Sstevel@tonic-gate 	 * Set up the marker pages used to walk the list
7960Sstevel@tonic-gate 	 */
7970Sstevel@tonic-gate 	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
7980Sstevel@tonic-gate 	end->p_vnode = vp;
7990Sstevel@tonic-gate 	end->p_offset = (u_offset_t)-2;
8000Sstevel@tonic-gate 	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
8010Sstevel@tonic-gate 	mark->p_vnode = vp;
8020Sstevel@tonic-gate 	mark->p_offset = (u_offset_t)-1;
8030Sstevel@tonic-gate 
8040Sstevel@tonic-gate 	/*
8050Sstevel@tonic-gate 	 * Grab the lock protecting the vnode's page list
8060Sstevel@tonic-gate 	 * note that this lock is dropped at times in the loop.
8070Sstevel@tonic-gate 	 */
8080Sstevel@tonic-gate 	vphm = page_vnode_mutex(vp);
8090Sstevel@tonic-gate 	mutex_enter(vphm);
8100Sstevel@tonic-gate 	if (vp->v_pages == NULL)
8110Sstevel@tonic-gate 		goto leave;
8120Sstevel@tonic-gate 
8130Sstevel@tonic-gate 	/*
8140Sstevel@tonic-gate 	 * insert the markers and loop through the list of pages
8150Sstevel@tonic-gate 	 */
8160Sstevel@tonic-gate 	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
8170Sstevel@tonic-gate 	page_vpadd(&mark->p_vpnext, end);
8180Sstevel@tonic-gate 	for (;;) {
8190Sstevel@tonic-gate 
8200Sstevel@tonic-gate 		/*
8210Sstevel@tonic-gate 		 * If only doing an async write back, then we can
8220Sstevel@tonic-gate 		 * stop as soon as we get to start of the list.
8230Sstevel@tonic-gate 		 */
8240Sstevel@tonic-gate 		if (flags == B_ASYNC && vp->v_pages == mark)
8250Sstevel@tonic-gate 			break;
8260Sstevel@tonic-gate 
8270Sstevel@tonic-gate 		/*
8280Sstevel@tonic-gate 		 * otherwise stop when we've gone through all the pages
8290Sstevel@tonic-gate 		 */
8300Sstevel@tonic-gate 		if (mark->p_vpprev == end)
8310Sstevel@tonic-gate 			break;
8320Sstevel@tonic-gate 
		/*
		 * Examine the page just ahead of the marker.  where_to_move
		 * records the link that currently points at pp (the list
		 * head pointer itself when pp is the first page); the marker
		 * is re-added through that link once it may step past pp.
		 */
8330Sstevel@tonic-gate 		pp = mark->p_vpprev;
8340Sstevel@tonic-gate 		if (vp->v_pages == pp)
8350Sstevel@tonic-gate 			where_to_move = &vp->v_pages;
8360Sstevel@tonic-gate 		else
8370Sstevel@tonic-gate 			where_to_move = &pp->p_vpprev->p_vpnext;
8380Sstevel@tonic-gate 
8390Sstevel@tonic-gate 		ASSERT(pp->p_vnode == vp);
8400Sstevel@tonic-gate 
8410Sstevel@tonic-gate 		/*
8420Sstevel@tonic-gate 		 * If just flushing dirty pages to disk and this vnode
8430Sstevel@tonic-gate 		 * is using a sorted list of pages, we can stop processing
8440Sstevel@tonic-gate 		 * as soon as we find an unmodified page. Since all the
8450Sstevel@tonic-gate 		 * modified pages are visited first.
8460Sstevel@tonic-gate 		 */
8470Sstevel@tonic-gate 		if (IS_VMODSORT(vp) &&
8482999Sstans 		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
8492999Sstans 			if (!hat_ismod(pp) && !page_io_locked(pp)) {
8500Sstevel@tonic-gate #ifdef  DEBUG
8512999Sstans 				/*
8522999Sstans 				 * For debug kernels examine what should be
8532999Sstans 				 * all the remaining clean pages, asserting
8542999Sstans 				 * that they are not modified.
8552999Sstans 				 */
8562999Sstans 				page_t	*chk = pp;
8572999Sstans 				int	attr;
8580Sstevel@tonic-gate 
8592999Sstans 				page_vpsub(&vp->v_pages, mark);
8602999Sstans 				page_vpadd(where_to_move, mark);
8612999Sstans 				do {
8622999Sstans 					chk = chk->p_vpprev;
8632999Sstans 					ASSERT(chk != end);
8642999Sstans 					if (chk == mark)
8652999Sstans 						continue;
8662999Sstans 					attr = hat_page_getattr(chk, P_MOD |
8672999Sstans 					    P_REF);
8682999Sstans 					if ((attr & P_MOD) == 0)
8692999Sstans 						continue;
8702999Sstans 					panic("v_pages list not all clean: "
8712999Sstans 					    "page_t*=%p vnode=%p off=%lx "
8722999Sstans 					    "attr=0x%x last clean page_t*=%p\n",
8732999Sstans 					    (void *)chk, (void *)chk->p_vnode,
8742999Sstans 					    (long)chk->p_offset, attr,
8752999Sstans 					    (void *)pp);
8762999Sstans 				} while (chk != vp->v_pages);
8770Sstevel@tonic-gate #endif
8782999Sstans 				break;
8792999Sstans 			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
8802999Sstans 				/*
8812999Sstans 				 * Couldn't get io lock, wait until IO is done.
8822999Sstans 				 * Block only for sync IO since we don't want
8832999Sstans 				 * to block async IO.
8842999Sstans 				 */
8852999Sstans 				mutex_exit(vphm);
8862999Sstans 				page_io_wait(pp);
8872999Sstans 				mutex_enter(vphm);
8882999Sstans 				continue;
8892999Sstans 			}
8900Sstevel@tonic-gate 		}
8910Sstevel@tonic-gate 
8920Sstevel@tonic-gate 		/*
8937972SPeter.Telford@Sun.COM 		 * Skip this page if the offset is out of the desired range.
8947972SPeter.Telford@Sun.COM 		 * Just move the marker and continue.
8957972SPeter.Telford@Sun.COM 		 */
8967972SPeter.Telford@Sun.COM 		if (pp->p_offset < off) {
8977972SPeter.Telford@Sun.COM 			page_vpsub(&vp->v_pages, mark);
8987972SPeter.Telford@Sun.COM 			page_vpadd(where_to_move, mark);
8997972SPeter.Telford@Sun.COM 			continue;
9007972SPeter.Telford@Sun.COM 		}
9017972SPeter.Telford@Sun.COM 
9027972SPeter.Telford@Sun.COM 		/*
9030Sstevel@tonic-gate 		 * If we are supposed to invalidate or free this
9040Sstevel@tonic-gate 		 * page, then we need an exclusive lock.
9050Sstevel@tonic-gate 		 */
9060Sstevel@tonic-gate 		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
9070Sstevel@tonic-gate 
9080Sstevel@tonic-gate 		/*
9090Sstevel@tonic-gate 		 * We must acquire the page lock for all synchronous
9100Sstevel@tonic-gate 		 * operations (invalidate, free and write).
9110Sstevel@tonic-gate 		 */
9120Sstevel@tonic-gate 		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
9130Sstevel@tonic-gate 			/*
9140Sstevel@tonic-gate 			 * If the page_lock() drops the mutex
9150Sstevel@tonic-gate 			 * we must retry the loop.
9160Sstevel@tonic-gate 			 */
9170Sstevel@tonic-gate 			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
9180Sstevel@tonic-gate 				continue;
9190Sstevel@tonic-gate 
9200Sstevel@tonic-gate 			/*
9210Sstevel@tonic-gate 			 * It's ok to move the marker page now.
9220Sstevel@tonic-gate 			 */
9230Sstevel@tonic-gate 			page_vpsub(&vp->v_pages, mark);
9240Sstevel@tonic-gate 			page_vpadd(where_to_move, mark);
9250Sstevel@tonic-gate 		} else {
9260Sstevel@tonic-gate 
9270Sstevel@tonic-gate 			/*
9280Sstevel@tonic-gate 			 * update the marker page for all remaining cases
9290Sstevel@tonic-gate 			 */
9300Sstevel@tonic-gate 			page_vpsub(&vp->v_pages, mark);
9310Sstevel@tonic-gate 			page_vpadd(where_to_move, mark);
9320Sstevel@tonic-gate 
9330Sstevel@tonic-gate 			/*
9340Sstevel@tonic-gate 			 * For write backs, If we can't lock the page, it's
9350Sstevel@tonic-gate 			 * invalid or in the process of being destroyed.  Skip
9360Sstevel@tonic-gate 			 * it, assuming someone else is writing it.
9370Sstevel@tonic-gate 			 */
9380Sstevel@tonic-gate 			if (!page_trylock(pp, se))
9390Sstevel@tonic-gate 				continue;
9400Sstevel@tonic-gate 		}
9410Sstevel@tonic-gate 
9420Sstevel@tonic-gate 		ASSERT(pp->p_vnode == vp);
9430Sstevel@tonic-gate 
9440Sstevel@tonic-gate 		/*
9450Sstevel@tonic-gate 		 * Successfully locked the page, now figure out what to
9460Sstevel@tonic-gate 		 * do with it. Free pages are easily dealt with, invalidate
9470Sstevel@tonic-gate 		 * if desired or just go on to the next page.
9480Sstevel@tonic-gate 		 */
9490Sstevel@tonic-gate 		if (PP_ISFREE(pp)) {
9500Sstevel@tonic-gate 			if ((flags & B_INVAL) == 0) {
9510Sstevel@tonic-gate 				page_unlock(pp);
9520Sstevel@tonic-gate 				continue;
9530Sstevel@tonic-gate 			}
9540Sstevel@tonic-gate 
9550Sstevel@tonic-gate 			/*
9560Sstevel@tonic-gate 			 * Invalidate (destroy) the page.
9570Sstevel@tonic-gate 			 */
9580Sstevel@tonic-gate 			mutex_exit(vphm);
9590Sstevel@tonic-gate 			page_destroy_free(pp);
9600Sstevel@tonic-gate 			mutex_enter(vphm);
9610Sstevel@tonic-gate 			continue;
9620Sstevel@tonic-gate 		}
9630Sstevel@tonic-gate 
9640Sstevel@tonic-gate 		/*
9650Sstevel@tonic-gate 		 * pvn_getdirty() figures out what to do with a dirty page.
9660Sstevel@tonic-gate 		 * If the page is dirty, the putapage() routine will write it
9670Sstevel@tonic-gate 		 * and will kluster any other adjacent dirty pages it can.
9680Sstevel@tonic-gate 		 *
9690Sstevel@tonic-gate 		 * pvn_getdirty() and `(*putapage)' unlock the page.
9700Sstevel@tonic-gate 		 */
9710Sstevel@tonic-gate 		mutex_exit(vphm);
9720Sstevel@tonic-gate 		if (pvn_getdirty(pp, flags)) {
9730Sstevel@tonic-gate 			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			/*
			 * Remember only the first failure; the walk goes on
			 * so the remaining pages are still processed.
			 */
9740Sstevel@tonic-gate 			if (!err)
9750Sstevel@tonic-gate 				err = error;
9760Sstevel@tonic-gate 		}
9770Sstevel@tonic-gate 		mutex_enter(vphm);
9780Sstevel@tonic-gate 	}
	/*
	 * Walk finished (vphm is held here): take both markers back off
	 * the vnode's page list before they are freed below.
	 */
9790Sstevel@tonic-gate 	page_vpsub(&vp->v_pages, mark);
9800Sstevel@tonic-gate 	page_vpsub(&vp->v_pages, end);
9810Sstevel@tonic-gate 
9820Sstevel@tonic-gate leave:
9830Sstevel@tonic-gate 	/*
9840Sstevel@tonic-gate 	 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
9850Sstevel@tonic-gate 	 */
9860Sstevel@tonic-gate 	mutex_exit(vphm);
9870Sstevel@tonic-gate 	kmem_cache_free(marker_cache, mark);
9880Sstevel@tonic-gate 	kmem_cache_free(marker_cache, end);
9890Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
9900Sstevel@tonic-gate 	vp->v_flag &= ~VVMLOCK;
9910Sstevel@tonic-gate 	cv_broadcast(&vp->v_cv);
9920Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
9930Sstevel@tonic-gate 	return (err);
9940Sstevel@tonic-gate }
9950Sstevel@tonic-gate 
9960Sstevel@tonic-gate /*
997*11888SPavel.Filipensky@Sun.COM  * Walk the vp->v_pages list, for every page call the callback function
998*11888SPavel.Filipensky@Sun.COM  * pointed by *page_check. If page_check returns non-zero, then mark the
999*11888SPavel.Filipensky@Sun.COM  * page as modified and if VMODSORT is set, move it to the end of v_pages
1000*11888SPavel.Filipensky@Sun.COM  * list. Moving makes sense only if we have at least two pages - this also
1001*11888SPavel.Filipensky@Sun.COM  * avoids having v_pages temporarily being NULL after calling page_vpsub()
1002*11888SPavel.Filipensky@Sun.COM  * if there was just one page.
1003*11888SPavel.Filipensky@Sun.COM  */
1004*11888SPavel.Filipensky@Sun.COM void
1005*11888SPavel.Filipensky@Sun.COM pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1006*11888SPavel.Filipensky@Sun.COM {
1007*11888SPavel.Filipensky@Sun.COM 	page_t	*pp, *next, *end;
1008*11888SPavel.Filipensky@Sun.COM 	kmutex_t	*vphm;
1009*11888SPavel.Filipensky@Sun.COM 	int	shuffle;
1010*11888SPavel.Filipensky@Sun.COM 
1011*11888SPavel.Filipensky@Sun.COM 	vphm = page_vnode_mutex(vp);
1012*11888SPavel.Filipensky@Sun.COM 	mutex_enter(vphm);
1013*11888SPavel.Filipensky@Sun.COM 
1014*11888SPavel.Filipensky@Sun.COM 	if (vp->v_pages == NULL) {
1015*11888SPavel.Filipensky@Sun.COM 		mutex_exit(vphm);
1016*11888SPavel.Filipensky@Sun.COM 		return;
1017*11888SPavel.Filipensky@Sun.COM 	}
1018*11888SPavel.Filipensky@Sun.COM 
1019*11888SPavel.Filipensky@Sun.COM 	end = vp->v_pages->p_vpprev;
1020*11888SPavel.Filipensky@Sun.COM 	shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1021*11888SPavel.Filipensky@Sun.COM 	pp = vp->v_pages;
1022*11888SPavel.Filipensky@Sun.COM 
1023*11888SPavel.Filipensky@Sun.COM 	for (;;) {
1024*11888SPavel.Filipensky@Sun.COM 		next = pp->p_vpnext;
1025*11888SPavel.Filipensky@Sun.COM 		if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1026*11888SPavel.Filipensky@Sun.COM 			/*
1027*11888SPavel.Filipensky@Sun.COM 			 * hat_setmod_only() in contrast to hat_setmod() does
1028*11888SPavel.Filipensky@Sun.COM 			 * not shuffle the pages and does not grab the mutex
1029*11888SPavel.Filipensky@Sun.COM 			 * page_vnode_mutex. Exactly what we need.
1030*11888SPavel.Filipensky@Sun.COM 			 */
1031*11888SPavel.Filipensky@Sun.COM 			hat_setmod_only(pp);
1032*11888SPavel.Filipensky@Sun.COM 			if (shuffle) {
1033*11888SPavel.Filipensky@Sun.COM 				page_vpsub(&vp->v_pages, pp);
1034*11888SPavel.Filipensky@Sun.COM 				ASSERT(vp->v_pages != NULL);
1035*11888SPavel.Filipensky@Sun.COM 				page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1036*11888SPavel.Filipensky@Sun.COM 				    pp);
1037*11888SPavel.Filipensky@Sun.COM 			}
1038*11888SPavel.Filipensky@Sun.COM 		}
1039*11888SPavel.Filipensky@Sun.COM 		/* Stop if we have just processed the last page. */
1040*11888SPavel.Filipensky@Sun.COM 		if (pp == end)
1041*11888SPavel.Filipensky@Sun.COM 			break;
1042*11888SPavel.Filipensky@Sun.COM 		pp = next;
1043*11888SPavel.Filipensky@Sun.COM 	}
1044*11888SPavel.Filipensky@Sun.COM 
1045*11888SPavel.Filipensky@Sun.COM 	mutex_exit(vphm);
1046*11888SPavel.Filipensky@Sun.COM }
1047*11888SPavel.Filipensky@Sun.COM 
1048*11888SPavel.Filipensky@Sun.COM /*
10490Sstevel@tonic-gate  * Zero out zbytes worth of data. Caller should be aware that this
10500Sstevel@tonic-gate  * routine may enter back into the fs layer (xxx_getpage). Locks
10510Sstevel@tonic-gate  * that the xxx_getpage routine may need should not be held while
10520Sstevel@tonic-gate  * calling this.
10530Sstevel@tonic-gate  */
10540Sstevel@tonic-gate void
10550Sstevel@tonic-gate pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
10560Sstevel@tonic-gate {
10570Sstevel@tonic-gate 	caddr_t addr;
10580Sstevel@tonic-gate 
10590Sstevel@tonic-gate 	ASSERT(vp->v_type != VCHR);
10600Sstevel@tonic-gate 
10610Sstevel@tonic-gate 	if (vp->v_pages == NULL)
10620Sstevel@tonic-gate 		return;
10630Sstevel@tonic-gate 
10640Sstevel@tonic-gate 	/*
10650Sstevel@tonic-gate 	 * zbytes may be zero but there still may be some portion of
10660Sstevel@tonic-gate 	 * a page which needs clearing (since zbytes is a function
10670Sstevel@tonic-gate 	 * of filesystem block size, not pagesize.)
10680Sstevel@tonic-gate 	 */
10690Sstevel@tonic-gate 	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
10700Sstevel@tonic-gate 		return;
10710Sstevel@tonic-gate 
10720Sstevel@tonic-gate 	/*
10730Sstevel@tonic-gate 	 * We get the last page and handle the partial
10740Sstevel@tonic-gate 	 * zeroing via kernel mappings.  This will make the page
10750Sstevel@tonic-gate 	 * dirty so that we know that when this page is written
10760Sstevel@tonic-gate 	 * back, the zeroed information will go out with it.  If
10770Sstevel@tonic-gate 	 * the page is not currently in memory, then the kzero
10780Sstevel@tonic-gate 	 * operation will cause it to be brought it.  We use kzero
10790Sstevel@tonic-gate 	 * instead of bzero so that if the page cannot be read in
10800Sstevel@tonic-gate 	 * for any reason, the system will not panic.  We need
10810Sstevel@tonic-gate 	 * to zero out a minimum of the fs given zbytes, but we
10820Sstevel@tonic-gate 	 * might also have to do more to get the entire last page.
10830Sstevel@tonic-gate 	 */
10840Sstevel@tonic-gate 
10850Sstevel@tonic-gate 	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
10860Sstevel@tonic-gate 		panic("pvn_vptrunc zbytes");
10870Sstevel@tonic-gate 	addr = segmap_getmapflt(segkmap, vp, vplen,
10880Sstevel@tonic-gate 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
10890Sstevel@tonic-gate 	(void) kzero(addr + (vplen & MAXBOFFSET),
10900Sstevel@tonic-gate 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
10910Sstevel@tonic-gate 	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
10920Sstevel@tonic-gate }
10930Sstevel@tonic-gate 
10940Sstevel@tonic-gate /*
10950Sstevel@tonic-gate  * Handles common work of the VOP_GETPAGE routines when more than
10960Sstevel@tonic-gate  * one page must be returned by calling a file system specific operation
10970Sstevel@tonic-gate  * to do most of the work.  Must be called with the vp already locked
10980Sstevel@tonic-gate  * by the VOP_GETPAGE routine.
10990Sstevel@tonic-gate  */
11000Sstevel@tonic-gate int
11010Sstevel@tonic-gate pvn_getpages(
11020Sstevel@tonic-gate 	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
11030Sstevel@tonic-gate 		size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
11040Sstevel@tonic-gate 	struct vnode *vp,
11050Sstevel@tonic-gate 	u_offset_t off,
11060Sstevel@tonic-gate 	size_t len,
11070Sstevel@tonic-gate 	uint_t *protp,
11080Sstevel@tonic-gate 	page_t *pl[],
11090Sstevel@tonic-gate 	size_t plsz,
11100Sstevel@tonic-gate 	struct seg *seg,
11110Sstevel@tonic-gate 	caddr_t addr,
11120Sstevel@tonic-gate 	enum seg_rw rw,
11130Sstevel@tonic-gate 	struct cred *cred)
11140Sstevel@tonic-gate {
11150Sstevel@tonic-gate 	page_t **ppp;
11160Sstevel@tonic-gate 	u_offset_t o, eoff;
11170Sstevel@tonic-gate 	size_t sz, xlen;
11180Sstevel@tonic-gate 	int err;
11190Sstevel@tonic-gate 
11200Sstevel@tonic-gate 	ASSERT(plsz >= len);		/* insure that we have enough space */
11210Sstevel@tonic-gate 
11220Sstevel@tonic-gate 	/*
11230Sstevel@tonic-gate 	 * Loop one page at a time and let getapage function fill
11240Sstevel@tonic-gate 	 * in the next page in array.  We only allow one page to be
11250Sstevel@tonic-gate 	 * returned at a time (except for the last page) so that we
11260Sstevel@tonic-gate 	 * don't have any problems with duplicates and other such
11270Sstevel@tonic-gate 	 * painful problems.  This is a very simple minded algorithm,
11280Sstevel@tonic-gate 	 * but it does the job correctly.  We hope that the cost of a
11290Sstevel@tonic-gate 	 * getapage call for a resident page that we might have been
11300Sstevel@tonic-gate 	 * able to get from an earlier call doesn't cost too much.
11310Sstevel@tonic-gate 	 */
11320Sstevel@tonic-gate 	ppp = pl;
11330Sstevel@tonic-gate 	sz = PAGESIZE;
11340Sstevel@tonic-gate 	eoff = off + len;
11350Sstevel@tonic-gate 	xlen = len;
11360Sstevel@tonic-gate 	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
11370Sstevel@tonic-gate 	    xlen -= PAGESIZE) {
11380Sstevel@tonic-gate 		if (o + PAGESIZE >= eoff) {
11390Sstevel@tonic-gate 			/*
11400Sstevel@tonic-gate 			 * Last time through - allow the all of
11410Sstevel@tonic-gate 			 * what's left of the pl[] array to be used.
11420Sstevel@tonic-gate 			 */
11430Sstevel@tonic-gate 			sz = plsz - (o - off);
11440Sstevel@tonic-gate 		}
11450Sstevel@tonic-gate 		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
11460Sstevel@tonic-gate 		    rw, cred);
11470Sstevel@tonic-gate 		if (err) {
11480Sstevel@tonic-gate 			/*
11490Sstevel@tonic-gate 			 * Release any pages we already got.
11500Sstevel@tonic-gate 			 */
11510Sstevel@tonic-gate 			if (o > off && pl != NULL) {
11520Sstevel@tonic-gate 				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
11530Sstevel@tonic-gate 					(void) page_release(*ppp, 1);
11540Sstevel@tonic-gate 			}
11550Sstevel@tonic-gate 			break;
11560Sstevel@tonic-gate 		}
11570Sstevel@tonic-gate 		if (pl != NULL)
11580Sstevel@tonic-gate 			ppp++;
11590Sstevel@tonic-gate 	}
11600Sstevel@tonic-gate 	return (err);
11610Sstevel@tonic-gate }
11620Sstevel@tonic-gate 
11630Sstevel@tonic-gate /*
11640Sstevel@tonic-gate  * Initialize the page list array.
11650Sstevel@tonic-gate  */
11663351Saguzovsk /*ARGSUSED*/
11670Sstevel@tonic-gate void
11680Sstevel@tonic-gate pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
11690Sstevel@tonic-gate     u_offset_t off, size_t io_len, enum seg_rw rw)
11700Sstevel@tonic-gate {
11710Sstevel@tonic-gate 	ssize_t sz;
11720Sstevel@tonic-gate 	page_t *ppcur, **ppp;
11730Sstevel@tonic-gate 
11743351Saguzovsk 	/*
11753351Saguzovsk 	 * Set up to load plsz worth
11763351Saguzovsk 	 * starting at the needed page.
11773351Saguzovsk 	 */
11783351Saguzovsk 	while (pp != NULL && pp->p_offset != off) {
11790Sstevel@tonic-gate 		/*
11803351Saguzovsk 		 * Remove page from the i/o list,
11813351Saguzovsk 		 * release the i/o and the page lock.
11820Sstevel@tonic-gate 		 */
11833351Saguzovsk 		ppcur = pp;
11843351Saguzovsk 		page_sub(&pp, ppcur);
11853351Saguzovsk 		page_io_unlock(ppcur);
11863351Saguzovsk 		(void) page_release(ppcur, 1);
11870Sstevel@tonic-gate 	}
11880Sstevel@tonic-gate 
11893351Saguzovsk 	if (pp == NULL) {
11903351Saguzovsk 		pl[0] = NULL;
11913351Saguzovsk 		return;
11923351Saguzovsk 	}
11933351Saguzovsk 
11943351Saguzovsk 	sz = plsz;
11953351Saguzovsk 
11960Sstevel@tonic-gate 	/*
11970Sstevel@tonic-gate 	 * Initialize the page list array.
11980Sstevel@tonic-gate 	 */
11990Sstevel@tonic-gate 	ppp = pl;
12000Sstevel@tonic-gate 	do {
12010Sstevel@tonic-gate 		ppcur = pp;
12020Sstevel@tonic-gate 		*ppp++ = ppcur;
12030Sstevel@tonic-gate 		page_sub(&pp, ppcur);
12040Sstevel@tonic-gate 		page_io_unlock(ppcur);
12050Sstevel@tonic-gate 		if (rw != S_CREATE)
12060Sstevel@tonic-gate 			page_downgrade(ppcur);
12070Sstevel@tonic-gate 		sz -= PAGESIZE;
12080Sstevel@tonic-gate 	} while (sz > 0 && pp != NULL);
12090Sstevel@tonic-gate 	*ppp = NULL;		/* terminate list */
12100Sstevel@tonic-gate 
12110Sstevel@tonic-gate 	/*
12120Sstevel@tonic-gate 	 * Now free the remaining pages that weren't
12130Sstevel@tonic-gate 	 * loaded in the page list.
12140Sstevel@tonic-gate 	 */
12150Sstevel@tonic-gate 	while (pp != NULL) {
12160Sstevel@tonic-gate 		ppcur = pp;
12170Sstevel@tonic-gate 		page_sub(&pp, ppcur);
12180Sstevel@tonic-gate 		page_io_unlock(ppcur);
12190Sstevel@tonic-gate 		(void) page_release(ppcur, 1);
12200Sstevel@tonic-gate 	}
12210Sstevel@tonic-gate }
1222