xref: /onnv-gate/usr/src/uts/common/os/vm_pageout.c (revision 11173:87f3734e64df)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
380Sstevel@tonic-gate 
390Sstevel@tonic-gate #include <sys/types.h>
400Sstevel@tonic-gate #include <sys/t_lock.h>
410Sstevel@tonic-gate #include <sys/param.h>
420Sstevel@tonic-gate #include <sys/buf.h>
430Sstevel@tonic-gate #include <sys/uio.h>
440Sstevel@tonic-gate #include <sys/proc.h>
450Sstevel@tonic-gate #include <sys/systm.h>
460Sstevel@tonic-gate #include <sys/mman.h>
470Sstevel@tonic-gate #include <sys/cred.h>
480Sstevel@tonic-gate #include <sys/vnode.h>
490Sstevel@tonic-gate #include <sys/vm.h>
500Sstevel@tonic-gate #include <sys/vmparam.h>
510Sstevel@tonic-gate #include <sys/vtrace.h>
520Sstevel@tonic-gate #include <sys/cmn_err.h>
530Sstevel@tonic-gate #include <sys/cpuvar.h>
540Sstevel@tonic-gate #include <sys/user.h>
550Sstevel@tonic-gate #include <sys/kmem.h>
560Sstevel@tonic-gate #include <sys/debug.h>
570Sstevel@tonic-gate #include <sys/callb.h>
580Sstevel@tonic-gate #include <sys/tnf_probe.h>
590Sstevel@tonic-gate #include <sys/mem_cage.h>
600Sstevel@tonic-gate #include <sys/time.h>
610Sstevel@tonic-gate 
620Sstevel@tonic-gate #include <vm/hat.h>
630Sstevel@tonic-gate #include <vm/as.h>
640Sstevel@tonic-gate #include <vm/seg.h>
650Sstevel@tonic-gate #include <vm/page.h>
660Sstevel@tonic-gate #include <vm/pvn.h>
670Sstevel@tonic-gate #include <vm/seg_kmem.h>
680Sstevel@tonic-gate 
690Sstevel@tonic-gate static int checkpage(page_t *, int);
700Sstevel@tonic-gate 
710Sstevel@tonic-gate /*
720Sstevel@tonic-gate  * The following parameters control operation of the page replacement
730Sstevel@tonic-gate  * algorithm.  They are initialized to 0, and then computed at boot time
740Sstevel@tonic-gate  * based on the size of the system.  If they are patched non-zero in
750Sstevel@tonic-gate  * a loaded vmunix they are left alone and may thus be changed per system
760Sstevel@tonic-gate  * using adb on the loaded system.
770Sstevel@tonic-gate  */
780Sstevel@tonic-gate pgcnt_t		slowscan = 0;
790Sstevel@tonic-gate pgcnt_t		fastscan = 0;
800Sstevel@tonic-gate 
810Sstevel@tonic-gate static pgcnt_t	handspreadpages = 0;
820Sstevel@tonic-gate static int	loopfraction = 2;
830Sstevel@tonic-gate static pgcnt_t	looppages;
840Sstevel@tonic-gate static int	min_percent_cpu = 4;
850Sstevel@tonic-gate static int	max_percent_cpu = 80;
860Sstevel@tonic-gate static pgcnt_t	maxfastscan = 0;
870Sstevel@tonic-gate static pgcnt_t	maxslowscan = 100;
880Sstevel@tonic-gate 
890Sstevel@tonic-gate pgcnt_t	maxpgio = 0;
900Sstevel@tonic-gate pgcnt_t	minfree = 0;
910Sstevel@tonic-gate pgcnt_t	desfree = 0;
920Sstevel@tonic-gate pgcnt_t	lotsfree = 0;
930Sstevel@tonic-gate pgcnt_t	needfree = 0;
940Sstevel@tonic-gate pgcnt_t	throttlefree = 0;
950Sstevel@tonic-gate pgcnt_t	pageout_reserve = 0;
960Sstevel@tonic-gate 
970Sstevel@tonic-gate pgcnt_t	deficit;
980Sstevel@tonic-gate pgcnt_t	nscan;
990Sstevel@tonic-gate pgcnt_t	desscan;
1000Sstevel@tonic-gate 
1010Sstevel@tonic-gate /*
1020Sstevel@tonic-gate  * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
1030Sstevel@tonic-gate  * are the number of ticks in each wakeup cycle that gives the
1040Sstevel@tonic-gate  * equivalent of some underlying %CPU duty cycle.
1050Sstevel@tonic-gate  * When RATETOSCHEDPAGING is 4,  and hz is 100, pageout_scanner is
1060Sstevel@tonic-gate  * awakened every 25 clock ticks.  So, converting from %CPU to ticks
1070Sstevel@tonic-gate  * per wakeup cycle would be x% of 25, that is (x * 100) / 25.
1080Sstevel@tonic-gate  * So, for example, 4% == 1 tick and 80% == 20 ticks.
1090Sstevel@tonic-gate  *
1100Sstevel@tonic-gate  * min_pageout_ticks:
1110Sstevel@tonic-gate  *     ticks/wakeup equivalent of min_percent_cpu.
1120Sstevel@tonic-gate  *
1130Sstevel@tonic-gate  * max_pageout_ticks:
1140Sstevel@tonic-gate  *     ticks/wakeup equivalent of max_percent_cpu.
1150Sstevel@tonic-gate  *
1160Sstevel@tonic-gate  * pageout_ticks:
1170Sstevel@tonic-gate  *     Number of clock ticks budgeted for each wakeup cycle.
1180Sstevel@tonic-gate  *     Computed each time around by schedpaging().
1190Sstevel@tonic-gate  *     Varies between min_pageout_ticks .. max_pageout_ticks,
1200Sstevel@tonic-gate  *     depending on memory pressure.
1210Sstevel@tonic-gate  *
1220Sstevel@tonic-gate  * pageout_lbolt:
1230Sstevel@tonic-gate  *     Timestamp of the last time pageout_scanner woke up and started
1240Sstevel@tonic-gate  *     (or resumed) scanning for not recently referenced pages.
1250Sstevel@tonic-gate  */
1260Sstevel@tonic-gate 
1270Sstevel@tonic-gate static clock_t	min_pageout_ticks;
1280Sstevel@tonic-gate static clock_t	max_pageout_ticks;
1290Sstevel@tonic-gate static clock_t	pageout_ticks;
1300Sstevel@tonic-gate static clock_t	pageout_lbolt;
1310Sstevel@tonic-gate 
1320Sstevel@tonic-gate static uint_t	reset_hands;
1330Sstevel@tonic-gate 
/*
 * NOTE(review): the users of this mask are not in this part of the file;
 * presumably it selects how often (in pages scanned) the scanner polls
 * its budget -- confirm against pageout_scanner().
 */
#define	PAGES_POLL_MASK	1023
1350Sstevel@tonic-gate 
1360Sstevel@tonic-gate /*
1370Sstevel@tonic-gate  * pageout_sample_lim:
1380Sstevel@tonic-gate  *     The limit on the number of samples needed to establish a value
1390Sstevel@tonic-gate  *     for new pageout parameters, fastscan, slowscan, and handspreadpages.
1400Sstevel@tonic-gate  *
1410Sstevel@tonic-gate  * pageout_sample_cnt:
1420Sstevel@tonic-gate  *     Current sample number.  Once the sample gets large enough,
1430Sstevel@tonic-gate  *     set new values for handspreadpages, fastscan and slowscan.
1440Sstevel@tonic-gate  *
1450Sstevel@tonic-gate  * pageout_sample_pages:
1460Sstevel@tonic-gate  *     The accumulated number of pages scanned during sampling.
1470Sstevel@tonic-gate  *
1480Sstevel@tonic-gate  * pageout_sample_ticks:
1490Sstevel@tonic-gate  *     The accumulated clock ticks for the sample.
1500Sstevel@tonic-gate  *
1510Sstevel@tonic-gate  * pageout_rate:
1520Sstevel@tonic-gate  *     Rate in pages/nanosecond, computed at the end of sampling.
1530Sstevel@tonic-gate  *
1540Sstevel@tonic-gate  * pageout_new_spread:
1550Sstevel@tonic-gate  *     The new value to use for fastscan and handspreadpages.
1560Sstevel@tonic-gate  *     Calculated after enough samples have been taken.
1570Sstevel@tonic-gate  */
1580Sstevel@tonic-gate 
1590Sstevel@tonic-gate typedef hrtime_t hrrate_t;
1600Sstevel@tonic-gate 
1610Sstevel@tonic-gate static uint64_t	pageout_sample_lim = 4;
1620Sstevel@tonic-gate static uint64_t	pageout_sample_cnt = 0;
1630Sstevel@tonic-gate static pgcnt_t	pageout_sample_pages = 0;
1640Sstevel@tonic-gate static hrrate_t	pageout_rate = 0;
1650Sstevel@tonic-gate static pgcnt_t	pageout_new_spread = 0;
1660Sstevel@tonic-gate 
1670Sstevel@tonic-gate static clock_t	pageout_cycle_ticks;
1680Sstevel@tonic-gate static hrtime_t	sample_start, sample_end;
1690Sstevel@tonic-gate static hrtime_t	pageout_sample_etime = 0;
1700Sstevel@tonic-gate 
/*
 * Record number of times a pageout_scanner wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;
1770Sstevel@tonic-gate 
#ifdef VM_STATS
/*
 * Statistics counters, only compiled in when VM_STATS is defined.
 * NOTE(review): presumably one counter per checkpage() outcome -- confirm
 * against checkpage().
 */
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */
1830Sstevel@tonic-gate 
1840Sstevel@tonic-gate /*
1850Sstevel@tonic-gate  * Threads waiting for free memory use this condition variable and lock until
1860Sstevel@tonic-gate  * memory becomes available.
1870Sstevel@tonic-gate  */
1880Sstevel@tonic-gate kmutex_t	memavail_lock;
1890Sstevel@tonic-gate kcondvar_t	memavail_cv;
1900Sstevel@tonic-gate 
/*
 * The size of the clock loop.  setupclock() copies this into looppages.
 */
#define	LOOPPAGES	total_pages
1950Sstevel@tonic-gate 
1960Sstevel@tonic-gate /*
1970Sstevel@tonic-gate  * Set up the paging constants for the clock algorithm.
1980Sstevel@tonic-gate  * Called after the system is initialized and the amount of memory
1990Sstevel@tonic-gate  * and number of paging devices is known.
2000Sstevel@tonic-gate  *
2010Sstevel@tonic-gate  * lotsfree is 1/64 of memory, but at least 512K.
2020Sstevel@tonic-gate  * desfree is 1/2 of lotsfree.
2030Sstevel@tonic-gate  * minfree is 1/2 of desfree.
2040Sstevel@tonic-gate  *
2050Sstevel@tonic-gate  * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
2060Sstevel@tonic-gate  *
2070Sstevel@tonic-gate  *	lotsfree = btop(512K)
2080Sstevel@tonic-gate  *	desfree = btop(200K)
2090Sstevel@tonic-gate  *	minfree = btop(100K)
2100Sstevel@tonic-gate  *	throttlefree = INT_MIN
2110Sstevel@tonic-gate  *	max_percent_cpu = 4
2120Sstevel@tonic-gate  */
2130Sstevel@tonic-gate void
setupclock(int recalc)2140Sstevel@tonic-gate setupclock(int recalc)
2150Sstevel@tonic-gate {
2160Sstevel@tonic-gate 
2170Sstevel@tonic-gate 	static spgcnt_t init_lfree, init_dfree, init_mfree;
2180Sstevel@tonic-gate 	static spgcnt_t init_tfree, init_preserve, init_mpgio;
2190Sstevel@tonic-gate 	static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;
2200Sstevel@tonic-gate 
2210Sstevel@tonic-gate 	looppages = LOOPPAGES;
2220Sstevel@tonic-gate 
2230Sstevel@tonic-gate 	/*
2240Sstevel@tonic-gate 	 * setupclock can now be called to recalculate the paging
2250Sstevel@tonic-gate 	 * parameters in the case of dynamic addition of memory.
2260Sstevel@tonic-gate 	 * So to make sure we make the proper calculations, if such a
2270Sstevel@tonic-gate 	 * situation should arise, we save away the initial values
2280Sstevel@tonic-gate 	 * of each parameter so we can recall them when needed. This
2290Sstevel@tonic-gate 	 * way we don't lose the settings an admin might have made
2300Sstevel@tonic-gate 	 * through the /etc/system file.
2310Sstevel@tonic-gate 	 */
2320Sstevel@tonic-gate 
2330Sstevel@tonic-gate 	if (!recalc) {
2340Sstevel@tonic-gate 		init_lfree = lotsfree;
2350Sstevel@tonic-gate 		init_dfree = desfree;
2360Sstevel@tonic-gate 		init_mfree = minfree;
2370Sstevel@tonic-gate 		init_tfree = throttlefree;
2380Sstevel@tonic-gate 		init_preserve = pageout_reserve;
2390Sstevel@tonic-gate 		init_mpgio = maxpgio;
2400Sstevel@tonic-gate 		init_mfscan = maxfastscan;
2410Sstevel@tonic-gate 		init_fscan = fastscan;
2420Sstevel@tonic-gate 		init_sscan = slowscan;
2430Sstevel@tonic-gate 		init_hspages = handspreadpages;
2440Sstevel@tonic-gate 	}
2450Sstevel@tonic-gate 
2460Sstevel@tonic-gate 	/*
2470Sstevel@tonic-gate 	 * Set up thresholds for paging:
2480Sstevel@tonic-gate 	 */
2490Sstevel@tonic-gate 
2500Sstevel@tonic-gate 	/*
2510Sstevel@tonic-gate 	 * Lotsfree is threshold where paging daemon turns on.
2520Sstevel@tonic-gate 	 */
2530Sstevel@tonic-gate 	if (init_lfree == 0 || init_lfree >= looppages)
2540Sstevel@tonic-gate 		lotsfree = MAX(looppages / 64, btop(512 * 1024));
2550Sstevel@tonic-gate 	else
2560Sstevel@tonic-gate 		lotsfree = init_lfree;
2570Sstevel@tonic-gate 
2580Sstevel@tonic-gate 	/*
2590Sstevel@tonic-gate 	 * Desfree is amount of memory desired free.
2600Sstevel@tonic-gate 	 * If less than this for extended period, start swapping.
2610Sstevel@tonic-gate 	 */
2620Sstevel@tonic-gate 	if (init_dfree == 0 || init_dfree >= lotsfree)
2630Sstevel@tonic-gate 		desfree = lotsfree / 2;
2640Sstevel@tonic-gate 	else
2650Sstevel@tonic-gate 		desfree = init_dfree;
2660Sstevel@tonic-gate 
2670Sstevel@tonic-gate 	/*
2680Sstevel@tonic-gate 	 * Minfree is minimal amount of free memory which is tolerable.
2690Sstevel@tonic-gate 	 */
2700Sstevel@tonic-gate 	if (init_mfree == 0 || init_mfree >= desfree)
2710Sstevel@tonic-gate 		minfree = desfree / 2;
2720Sstevel@tonic-gate 	else
2730Sstevel@tonic-gate 		minfree = init_mfree;
2740Sstevel@tonic-gate 
2750Sstevel@tonic-gate 	/*
2760Sstevel@tonic-gate 	 * Throttlefree is the point at which we start throttling
2770Sstevel@tonic-gate 	 * PG_WAIT requests until enough memory becomes available.
2780Sstevel@tonic-gate 	 */
2790Sstevel@tonic-gate 	if (init_tfree == 0 || init_tfree >= desfree)
2800Sstevel@tonic-gate 		throttlefree = minfree;
2810Sstevel@tonic-gate 	else
2820Sstevel@tonic-gate 		throttlefree = init_tfree;
2830Sstevel@tonic-gate 
2840Sstevel@tonic-gate 	/*
2850Sstevel@tonic-gate 	 * Pageout_reserve is the number of pages that we keep in
2860Sstevel@tonic-gate 	 * stock for pageout's own use.  Having a few such pages
2870Sstevel@tonic-gate 	 * provides insurance against system deadlock due to
2880Sstevel@tonic-gate 	 * pageout needing pages.  When freemem < pageout_reserve,
2890Sstevel@tonic-gate 	 * non-blocking allocations are denied to any threads
2900Sstevel@tonic-gate 	 * other than pageout and sched.  (At some point we might
2910Sstevel@tonic-gate 	 * want to consider a per-thread flag like T_PUSHING_PAGES
2920Sstevel@tonic-gate 	 * to indicate that a thread is part of the page-pushing
2930Sstevel@tonic-gate 	 * dance (e.g. an interrupt thread) and thus is entitled
2940Sstevel@tonic-gate 	 * to the same special dispensation we accord pageout.)
2950Sstevel@tonic-gate 	 */
2960Sstevel@tonic-gate 	if (init_preserve == 0 || init_preserve >= throttlefree)
2970Sstevel@tonic-gate 		pageout_reserve = throttlefree / 2;
2980Sstevel@tonic-gate 	else
2990Sstevel@tonic-gate 		pageout_reserve = init_preserve;
3000Sstevel@tonic-gate 
3010Sstevel@tonic-gate 	/*
3020Sstevel@tonic-gate 	 * Maxpgio thresholds how much paging is acceptable.
3030Sstevel@tonic-gate 	 * This figures that 2/3 busy on an arm is all that is
3040Sstevel@tonic-gate 	 * tolerable for paging.  We assume one operation per disk rev.
3050Sstevel@tonic-gate 	 *
3060Sstevel@tonic-gate 	 * XXX - Does not account for multiple swap devices.
3070Sstevel@tonic-gate 	 */
3080Sstevel@tonic-gate 	if (init_mpgio == 0)
3090Sstevel@tonic-gate 		maxpgio = (DISKRPM * 2) / 3;
3100Sstevel@tonic-gate 	else
3110Sstevel@tonic-gate 		maxpgio = init_mpgio;
3120Sstevel@tonic-gate 
3130Sstevel@tonic-gate 	/*
3140Sstevel@tonic-gate 	 * The clock scan rate varies between fastscan and slowscan
3150Sstevel@tonic-gate 	 * based on the amount of free memory available.  Fastscan
3160Sstevel@tonic-gate 	 * rate should be set based on the number pages that can be
3170Sstevel@tonic-gate 	 * scanned per sec using ~10% of processor time.  Since this
3180Sstevel@tonic-gate 	 * value depends on the processor, MMU, Mhz etc., it is
3190Sstevel@tonic-gate 	 * difficult to determine it in a generic manner for all
3200Sstevel@tonic-gate 	 * architectures.
3210Sstevel@tonic-gate 	 *
3220Sstevel@tonic-gate 	 * Instead of trying to determine the number of pages scanned
3230Sstevel@tonic-gate 	 * per sec for every processor, fastscan is set to be the smaller
3240Sstevel@tonic-gate 	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
3250Sstevel@tonic-gate 	 * time is limited to ~4% of processor time.
3260Sstevel@tonic-gate 	 *
3270Sstevel@tonic-gate 	 * Setting fastscan to be 1/2 of memory allows pageout to scan
3280Sstevel@tonic-gate 	 * all of memory in ~2 secs.  This implies that user pages not
3290Sstevel@tonic-gate 	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
3300Sstevel@tonic-gate 	 * can be reclaimed when free memory is very low.  Stealing pages
3310Sstevel@tonic-gate 	 * not accessed within 1 sec seems reasonable and ensures that
3320Sstevel@tonic-gate 	 * active user processes don't thrash.
3330Sstevel@tonic-gate 	 *
3340Sstevel@tonic-gate 	 * Smaller values of fastscan result in scanning fewer pages
3350Sstevel@tonic-gate 	 * every second and consequently pageout may not be able to free
3360Sstevel@tonic-gate 	 * sufficient memory to maintain the minimum threshold.  Larger
3370Sstevel@tonic-gate 	 * values of fastscan result in scanning a lot more pages which
3380Sstevel@tonic-gate 	 * could lead to thrashing and higher CPU usage.
3390Sstevel@tonic-gate 	 *
3400Sstevel@tonic-gate 	 * Fastscan needs to be limited to a maximum value and should not
3410Sstevel@tonic-gate 	 * scale with memory to prevent pageout from consuming too much
3420Sstevel@tonic-gate 	 * time for scanning on slow CPU's and avoid thrashing, as a
3430Sstevel@tonic-gate 	 * result of scanning too many pages, on faster CPU's.
3440Sstevel@tonic-gate 	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
3450Sstevel@tonic-gate 	 * (the upper bound for fastscan) based on the average number
3460Sstevel@tonic-gate 	 * of pages that can potentially be scanned in ~1 sec (using ~4%
3470Sstevel@tonic-gate 	 * of the CPU) on some of the following machines that currently
3480Sstevel@tonic-gate 	 * run Solaris 2.x:
3490Sstevel@tonic-gate 	 *
3500Sstevel@tonic-gate 	 *			average memory scanned in ~1 sec
3510Sstevel@tonic-gate 	 *
3520Sstevel@tonic-gate 	 *	25 Mhz SS1+:		23 Meg
3530Sstevel@tonic-gate 	 *	LX:			37 Meg
3540Sstevel@tonic-gate 	 *	50 Mhz SC2000:		68 Meg
3550Sstevel@tonic-gate 	 *
3560Sstevel@tonic-gate 	 *	40 Mhz 486:		26 Meg
3570Sstevel@tonic-gate 	 *	66 Mhz 486:		42 Meg
3580Sstevel@tonic-gate 	 *
3590Sstevel@tonic-gate 	 * When free memory falls just below lotsfree, the scan rate
3600Sstevel@tonic-gate 	 * goes from 0 to slowscan (i.e., pageout starts running).  This
3610Sstevel@tonic-gate 	 * transition needs to be smooth and is achieved by ensuring that
3620Sstevel@tonic-gate 	 * pageout scans a small number of pages to satisfy the transient
3630Sstevel@tonic-gate 	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
3640Sstevel@tonic-gate 	 * wakeup) since scanning that many pages has no noticible impact
3650Sstevel@tonic-gate 	 * on system performance.
3660Sstevel@tonic-gate 	 *
3670Sstevel@tonic-gate 	 * In addition to setting fastscan and slowscan, pageout is
3680Sstevel@tonic-gate 	 * limited to using ~4% of the CPU.  This results in increasing
3690Sstevel@tonic-gate 	 * the time taken to scan all of memory, which in turn means that
3700Sstevel@tonic-gate 	 * user processes have a better opportunity of preventing their
3710Sstevel@tonic-gate 	 * pages from being stolen.  This has a positive effect on
3720Sstevel@tonic-gate 	 * interactive and overall system performance when memory demand
3730Sstevel@tonic-gate 	 * is high.
3740Sstevel@tonic-gate 	 *
3750Sstevel@tonic-gate 	 * Thus, the rate at which pages are scanned for replacement will
3760Sstevel@tonic-gate 	 * vary linearly between slowscan and the number of pages that
3770Sstevel@tonic-gate 	 * can be scanned using ~4% of processor time instead of varying
3780Sstevel@tonic-gate 	 * linearly between slowscan and fastscan.
3790Sstevel@tonic-gate 	 *
3800Sstevel@tonic-gate 	 * Also, the processor time used by pageout will vary from ~1%
3810Sstevel@tonic-gate 	 * at slowscan to ~4% at fastscan instead of varying between
3820Sstevel@tonic-gate 	 * ~1% at slowscan and ~10% at fastscan.
3830Sstevel@tonic-gate 	 *
3840Sstevel@tonic-gate 	 * The values chosen for the various VM parameters (fastscan,
3850Sstevel@tonic-gate 	 * handspreadpages, etc) are not universally true for all machines,
3860Sstevel@tonic-gate 	 * but appear to be a good rule of thumb for the machines we've
3870Sstevel@tonic-gate 	 * tested.  They have the following ranges:
3880Sstevel@tonic-gate 	 *
3890Sstevel@tonic-gate 	 *	cpu speed:	20 to 70 Mhz
3900Sstevel@tonic-gate 	 *	page size:	4K to 8K
3910Sstevel@tonic-gate 	 *	memory size:	16M to 5G
3920Sstevel@tonic-gate 	 *	page scan rate:	4000 - 17400 4K pages per sec
3930Sstevel@tonic-gate 	 *
3940Sstevel@tonic-gate 	 * The values need to be re-examined for machines which don't
3950Sstevel@tonic-gate 	 * fall into the various ranges (e.g., slower or faster CPUs,
3960Sstevel@tonic-gate 	 * smaller or larger pagesizes etc) shown above.
3970Sstevel@tonic-gate 	 *
3980Sstevel@tonic-gate 	 * On an MP machine, pageout is often unable to maintain the
3990Sstevel@tonic-gate 	 * minimum paging thresholds under heavy load.  This is due to
4000Sstevel@tonic-gate 	 * the fact that user processes running on other CPU's can be
4010Sstevel@tonic-gate 	 * dirtying memory at a much faster pace than pageout can find
4020Sstevel@tonic-gate 	 * pages to free.  The memory demands could be met by enabling
4030Sstevel@tonic-gate 	 * more than one CPU to run the clock algorithm in such a manner
4040Sstevel@tonic-gate 	 * that the various clock hands don't overlap.  This also makes
4050Sstevel@tonic-gate 	 * it more difficult to determine the values for fastscan, slowscan
4060Sstevel@tonic-gate 	 * and handspreadpages.
4070Sstevel@tonic-gate 	 *
4080Sstevel@tonic-gate 	 * The swapper is currently used to free up memory when pageout
4090Sstevel@tonic-gate 	 * is unable to meet memory demands by swapping out processes.
4100Sstevel@tonic-gate 	 * In addition to freeing up memory, swapping also reduces the
4110Sstevel@tonic-gate 	 * demand for memory by preventing user processes from running
4120Sstevel@tonic-gate 	 * and thereby consuming memory.
4130Sstevel@tonic-gate 	 */
4140Sstevel@tonic-gate 	if (init_mfscan == 0) {
4150Sstevel@tonic-gate 		if (pageout_new_spread != 0)
4160Sstevel@tonic-gate 			maxfastscan = pageout_new_spread;
4170Sstevel@tonic-gate 		else
4180Sstevel@tonic-gate 			maxfastscan = MAXHANDSPREADPAGES;
4190Sstevel@tonic-gate 	} else {
4200Sstevel@tonic-gate 		maxfastscan = init_mfscan;
4210Sstevel@tonic-gate 	}
4220Sstevel@tonic-gate 	if (init_fscan == 0)
4230Sstevel@tonic-gate 		fastscan = MIN(looppages / loopfraction, maxfastscan);
4240Sstevel@tonic-gate 	else
4250Sstevel@tonic-gate 		fastscan = init_fscan;
4260Sstevel@tonic-gate 	if (fastscan > looppages / loopfraction)
4270Sstevel@tonic-gate 		fastscan = looppages / loopfraction;
4280Sstevel@tonic-gate 
4290Sstevel@tonic-gate 	/*
4300Sstevel@tonic-gate 	 * Set slow scan time to 1/10 the fast scan time, but
4310Sstevel@tonic-gate 	 * not to exceed maxslowscan.
4320Sstevel@tonic-gate 	 */
4330Sstevel@tonic-gate 	if (init_sscan == 0)
4340Sstevel@tonic-gate 		slowscan = MIN(fastscan / 10, maxslowscan);
4350Sstevel@tonic-gate 	else
4360Sstevel@tonic-gate 		slowscan = init_sscan;
4370Sstevel@tonic-gate 	if (slowscan > fastscan / 2)
4380Sstevel@tonic-gate 		slowscan = fastscan / 2;
4390Sstevel@tonic-gate 
4400Sstevel@tonic-gate 	/*
4410Sstevel@tonic-gate 	 * Handspreadpages is distance (in pages) between front and back
4420Sstevel@tonic-gate 	 * pageout daemon hands.  The amount of time to reclaim a page
4430Sstevel@tonic-gate 	 * once pageout examines it increases with this distance and
4440Sstevel@tonic-gate 	 * decreases as the scan rate rises. It must be < the amount
4450Sstevel@tonic-gate 	 * of pageable memory.
4460Sstevel@tonic-gate 	 *
4470Sstevel@tonic-gate 	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
4480Sstevel@tonic-gate 	 * to be "fastscan" results in the front hand being a few secs
4490Sstevel@tonic-gate 	 * (varies based on the processor speed) ahead of the back hand
4500Sstevel@tonic-gate 	 * at fastscan rates.  This distance can be further reduced, if
4510Sstevel@tonic-gate 	 * necessary, by increasing the processor time used by pageout
4520Sstevel@tonic-gate 	 * to be more than ~4% and preferrably not more than ~10%.
4530Sstevel@tonic-gate 	 *
4540Sstevel@tonic-gate 	 * As a result, user processes have a much better chance of
4550Sstevel@tonic-gate 	 * referencing their pages before the back hand examines them.
4560Sstevel@tonic-gate 	 * This also significantly lowers the number of reclaims from
4570Sstevel@tonic-gate 	 * the freelist since pageout does not end up freeing pages which
4580Sstevel@tonic-gate 	 * may be referenced a sec later.
4590Sstevel@tonic-gate 	 */
4600Sstevel@tonic-gate 	if (init_hspages == 0)
4610Sstevel@tonic-gate 		handspreadpages = fastscan;
4620Sstevel@tonic-gate 	else
4630Sstevel@tonic-gate 		handspreadpages = init_hspages;
4640Sstevel@tonic-gate 
4650Sstevel@tonic-gate 	/*
4660Sstevel@tonic-gate 	 * Make sure that back hand follows front hand by at least
4670Sstevel@tonic-gate 	 * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
4680Sstevel@tonic-gate 	 * for the back hand to look at a page during the same wakeup of
4690Sstevel@tonic-gate 	 * the pageout daemon in which the front hand cleared its ref bit.
4700Sstevel@tonic-gate 	 */
4710Sstevel@tonic-gate 	if (handspreadpages >= looppages)
4720Sstevel@tonic-gate 		handspreadpages = looppages - 1;
4730Sstevel@tonic-gate 
4740Sstevel@tonic-gate 	/*
4750Sstevel@tonic-gate 	 * If we have been called to recalculate the parameters,
4760Sstevel@tonic-gate 	 * set a flag to re-evaluate the clock hand pointers.
4770Sstevel@tonic-gate 	 */
4780Sstevel@tonic-gate 	if (recalc)
4790Sstevel@tonic-gate 		reset_hands = 1;
4800Sstevel@tonic-gate }
4810Sstevel@tonic-gate 
4820Sstevel@tonic-gate /*
4830Sstevel@tonic-gate  * Pageout scheduling.
4840Sstevel@tonic-gate  *
4850Sstevel@tonic-gate  * Schedpaging controls the rate at which the page out daemon runs by
4860Sstevel@tonic-gate  * setting the global variables nscan and desscan RATETOSCHEDPAGING
4870Sstevel@tonic-gate  * times a second.  Nscan records the number of pages pageout has examined
4880Sstevel@tonic-gate  * in its current pass; schedpaging resets this value to zero each time
4890Sstevel@tonic-gate  * it runs.  Desscan records the number of pages pageout should examine
4900Sstevel@tonic-gate  * in its next pass; schedpaging sets this value based on the amount of
4910Sstevel@tonic-gate  * currently available memory.
4920Sstevel@tonic-gate  */
4930Sstevel@tonic-gate 
4940Sstevel@tonic-gate #define	RATETOSCHEDPAGING	4		/* hz that is */
4950Sstevel@tonic-gate 
4960Sstevel@tonic-gate static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */
4970Sstevel@tonic-gate 
4980Sstevel@tonic-gate /*
4990Sstevel@tonic-gate  * Pool of available async pageout putpage requests.
5000Sstevel@tonic-gate  */
5010Sstevel@tonic-gate static struct async_reqs *push_req;
5020Sstevel@tonic-gate static struct async_reqs *req_freelist;	/* available req structs */
5030Sstevel@tonic-gate static struct async_reqs *push_list;	/* pending reqs */
5040Sstevel@tonic-gate static kmutex_t push_lock;		/* protects req pool */
5050Sstevel@tonic-gate static kcondvar_t push_cv;
5060Sstevel@tonic-gate 
5070Sstevel@tonic-gate static int async_list_size = 256;	/* number of async request structs */
5080Sstevel@tonic-gate 
5090Sstevel@tonic-gate static void pageout_scanner(void);
5100Sstevel@tonic-gate 
5110Sstevel@tonic-gate /*
5120Sstevel@tonic-gate  * If a page is being shared more than "po_share" times
5130Sstevel@tonic-gate  * then leave it alone- don't page it out.
5140Sstevel@tonic-gate  */
5150Sstevel@tonic-gate #define	MIN_PO_SHARE	(8)
5160Sstevel@tonic-gate #define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
5170Sstevel@tonic-gate ulong_t	po_share = MIN_PO_SHARE;
5180Sstevel@tonic-gate 
5190Sstevel@tonic-gate /*
5200Sstevel@tonic-gate  * Schedule rate for paging.
5210Sstevel@tonic-gate  * Rate is linear interpolation between
5220Sstevel@tonic-gate  * slowscan with lotsfree and fastscan when out of memory.
5230Sstevel@tonic-gate  */
5240Sstevel@tonic-gate static void
schedpaging(void * arg)5250Sstevel@tonic-gate schedpaging(void *arg)
5260Sstevel@tonic-gate {
5270Sstevel@tonic-gate 	spgcnt_t vavail;
5280Sstevel@tonic-gate 
5290Sstevel@tonic-gate 	if (freemem < lotsfree + needfree + kmem_reapahead)
5300Sstevel@tonic-gate 		kmem_reap();
5310Sstevel@tonic-gate 
5326695Saguzovsk 	if (freemem < lotsfree + needfree)
5330Sstevel@tonic-gate 		seg_preap();
5340Sstevel@tonic-gate 
5350Sstevel@tonic-gate 	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
5360Sstevel@tonic-gate 		kcage_cageout_wakeup();
5370Sstevel@tonic-gate 
5380Sstevel@tonic-gate 	if (mutex_tryenter(&pageout_mutex)) {
5390Sstevel@tonic-gate 		/* pageout() not running */
5400Sstevel@tonic-gate 		nscan = 0;
5410Sstevel@tonic-gate 		vavail = freemem - deficit;
5426118Sjimp 		if (pageout_new_spread != 0)
5436118Sjimp 			vavail -= needfree;
5440Sstevel@tonic-gate 		if (vavail < 0)
5450Sstevel@tonic-gate 			vavail = 0;
5460Sstevel@tonic-gate 		if (vavail > lotsfree)
5470Sstevel@tonic-gate 			vavail = lotsfree;
5480Sstevel@tonic-gate 
5490Sstevel@tonic-gate 		/*
5500Sstevel@tonic-gate 		 * Fix for 1161438 (CRS SPR# 73922).  All variables
5510Sstevel@tonic-gate 		 * in the original calculation for desscan were 32 bit signed
5520Sstevel@tonic-gate 		 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
5530Sstevel@tonic-gate 		 * more of memory, the calculation can overflow.  When this
5540Sstevel@tonic-gate 		 * happens, desscan becomes negative and pageout_scanner()
5550Sstevel@tonic-gate 		 * stops paging out.
5560Sstevel@tonic-gate 		 */
5576118Sjimp 		if ((needfree) && (pageout_new_spread == 0)) {
5586118Sjimp 			/*
5596118Sjimp 			 * If we've not yet collected enough samples to
5606118Sjimp 			 * calculate a spread, use the old logic of kicking
5616118Sjimp 			 * into high gear anytime needfree is non-zero.
5626118Sjimp 			 */
5630Sstevel@tonic-gate 			desscan = fastscan / RATETOSCHEDPAGING;
5640Sstevel@tonic-gate 		} else {
5656118Sjimp 			/*
5666118Sjimp 			 * Once we've calculated a spread based on system
5676118Sjimp 			 * memory and usage, just treat needfree as another
5686118Sjimp 			 * form of deficit.
5696118Sjimp 			 */
5700Sstevel@tonic-gate 			spgcnt_t faststmp, slowstmp, result;
5710Sstevel@tonic-gate 
5720Sstevel@tonic-gate 			slowstmp = slowscan * vavail;
5730Sstevel@tonic-gate 			faststmp = fastscan * (lotsfree - vavail);
5740Sstevel@tonic-gate 			result = (slowstmp + faststmp) /
5756118Sjimp 			    nz(lotsfree) / RATETOSCHEDPAGING;
5760Sstevel@tonic-gate 			desscan = (pgcnt_t)result;
5770Sstevel@tonic-gate 		}
5780Sstevel@tonic-gate 
5790Sstevel@tonic-gate 		pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
5800Sstevel@tonic-gate 		    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
5810Sstevel@tonic-gate 
5820Sstevel@tonic-gate 		if (freemem < lotsfree + needfree ||
5830Sstevel@tonic-gate 		    pageout_sample_cnt < pageout_sample_lim) {
5840Sstevel@tonic-gate 			TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
5856118Sjimp 			    "pageout_cv_signal:freemem %ld", freemem);
5860Sstevel@tonic-gate 			cv_signal(&proc_pageout->p_cv);
5870Sstevel@tonic-gate 		} else {
5880Sstevel@tonic-gate 			/*
5890Sstevel@tonic-gate 			 * There are enough free pages, no need to
5900Sstevel@tonic-gate 			 * kick the scanner thread.  And next time
5910Sstevel@tonic-gate 			 * around, keep more of the `highly shared'
5920Sstevel@tonic-gate 			 * pages.
5930Sstevel@tonic-gate 			 */
5940Sstevel@tonic-gate 			cv_signal_pageout();
5950Sstevel@tonic-gate 			if (po_share > MIN_PO_SHARE) {
5960Sstevel@tonic-gate 				po_share >>= 1;
5970Sstevel@tonic-gate 			}
5980Sstevel@tonic-gate 		}
5990Sstevel@tonic-gate 		mutex_exit(&pageout_mutex);
6000Sstevel@tonic-gate 	}
6010Sstevel@tonic-gate 
6020Sstevel@tonic-gate 	/*
6030Sstevel@tonic-gate 	 * Signal threads waiting for available memory.
6040Sstevel@tonic-gate 	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
6050Sstevel@tonic-gate 	 * in this case it is not needed - the waiters will be waken up during
6060Sstevel@tonic-gate 	 * the next invocation of this function.
6070Sstevel@tonic-gate 	 */
6080Sstevel@tonic-gate 	if (kmem_avail() > 0)
6090Sstevel@tonic-gate 		cv_broadcast(&memavail_cv);
6100Sstevel@tonic-gate 
6110Sstevel@tonic-gate 	(void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
6120Sstevel@tonic-gate }
6130Sstevel@tonic-gate 
6140Sstevel@tonic-gate pgcnt_t		pushes;
6150Sstevel@tonic-gate ulong_t		push_list_size;		/* # of requests on pageout queue */
6160Sstevel@tonic-gate 
6170Sstevel@tonic-gate #define	FRONT	1
6180Sstevel@tonic-gate #define	BACK	2
6190Sstevel@tonic-gate 
6200Sstevel@tonic-gate int dopageout = 1;	/* must be non-zero to turn page stealing on */
6210Sstevel@tonic-gate 
6220Sstevel@tonic-gate /*
6230Sstevel@tonic-gate  * The page out daemon, which runs as process 2.
6240Sstevel@tonic-gate  *
6250Sstevel@tonic-gate  * As long as there are at least lotsfree pages,
6260Sstevel@tonic-gate  * this process is not run.  When the number of free
6270Sstevel@tonic-gate  * pages stays in the range desfree to lotsfree,
6280Sstevel@tonic-gate  * this daemon runs through the pages in the loop
6290Sstevel@tonic-gate  * at a rate determined in schedpaging().  Pageout manages
6300Sstevel@tonic-gate  * two hands on the clock.  The front hand moves through
6310Sstevel@tonic-gate  * memory, clearing the reference bit,
6320Sstevel@tonic-gate  * and stealing pages from procs that are over maxrss.
6330Sstevel@tonic-gate  * The back hand travels a distance behind the front hand,
6340Sstevel@tonic-gate  * freeing the pages that have not been referenced in the time
6350Sstevel@tonic-gate  * since the front hand passed.  If modified, they are pushed to
6360Sstevel@tonic-gate  * swap before being freed.
6370Sstevel@tonic-gate  *
6380Sstevel@tonic-gate  * There are 2 threads that act on behalf of the pageout process.
6390Sstevel@tonic-gate  * One thread scans pages (pageout_scanner) and frees them up if
6400Sstevel@tonic-gate  * they don't require any VOP_PUTPAGE operation. If a page must be
6410Sstevel@tonic-gate  * written back to its backing store, the request is put on a list
6420Sstevel@tonic-gate  * and the other (pageout) thread is signaled. The pageout thread
6430Sstevel@tonic-gate  * grabs VOP_PUTPAGE requests from the list, and processes them.
6440Sstevel@tonic-gate  * Some filesystems may require resources for the VOP_PUTPAGE
6450Sstevel@tonic-gate  * operations (like memory) and hence can block the pageout
6460Sstevel@tonic-gate  * thread, but the scanner thread can still operate. There is still
6475331Samw  * no guarantee that memory deadlocks cannot occur.
6480Sstevel@tonic-gate  *
6490Sstevel@tonic-gate  * For now, this thing is in very rough form.
6500Sstevel@tonic-gate  */
6510Sstevel@tonic-gate void
pageout()6520Sstevel@tonic-gate pageout()
6530Sstevel@tonic-gate {
6540Sstevel@tonic-gate 	struct async_reqs *arg;
6550Sstevel@tonic-gate 	pri_t pageout_pri;
6560Sstevel@tonic-gate 	int i;
6570Sstevel@tonic-gate 	pgcnt_t max_pushes;
6580Sstevel@tonic-gate 	callb_cpr_t cprinfo;
6590Sstevel@tonic-gate 
6600Sstevel@tonic-gate 	proc_pageout = ttoproc(curthread);
6610Sstevel@tonic-gate 	proc_pageout->p_cstime = 0;
6620Sstevel@tonic-gate 	proc_pageout->p_stime =  0;
6630Sstevel@tonic-gate 	proc_pageout->p_cutime =  0;
6640Sstevel@tonic-gate 	proc_pageout->p_utime = 0;
6653446Smrj 	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
6663446Smrj 	bcopy("pageout", PTOU(curproc)->u_comm, 7);
6670Sstevel@tonic-gate 
6680Sstevel@tonic-gate 	/*
6690Sstevel@tonic-gate 	 * Create pageout scanner thread
6700Sstevel@tonic-gate 	 */
6710Sstevel@tonic-gate 	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
6720Sstevel@tonic-gate 	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
6730Sstevel@tonic-gate 
6740Sstevel@tonic-gate 	/*
6750Sstevel@tonic-gate 	 * Allocate and initialize the async request structures
6760Sstevel@tonic-gate 	 * for pageout.
6770Sstevel@tonic-gate 	 */
6780Sstevel@tonic-gate 	push_req = (struct async_reqs *)
6790Sstevel@tonic-gate 	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
6800Sstevel@tonic-gate 
6810Sstevel@tonic-gate 	req_freelist = push_req;
6820Sstevel@tonic-gate 	for (i = 0; i < async_list_size - 1; i++)
6830Sstevel@tonic-gate 		push_req[i].a_next = &push_req[i + 1];
6840Sstevel@tonic-gate 
6850Sstevel@tonic-gate 	pageout_pri = curthread->t_pri;
686*11173SJonathan.Adams@Sun.COM 
687*11173SJonathan.Adams@Sun.COM 	/* Create the pageout scanner thread. */
688*11173SJonathan.Adams@Sun.COM 	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
689*11173SJonathan.Adams@Sun.COM 	    pageout_pri - 1);
6900Sstevel@tonic-gate 
6910Sstevel@tonic-gate 	/*
6920Sstevel@tonic-gate 	 * kick off pageout scheduler.
6930Sstevel@tonic-gate 	 */
6940Sstevel@tonic-gate 	schedpaging(NULL);
6950Sstevel@tonic-gate 
6960Sstevel@tonic-gate 	/*
6970Sstevel@tonic-gate 	 * Create kernel cage thread.
6980Sstevel@tonic-gate 	 * The kernel cage thread is started under the pageout process
6990Sstevel@tonic-gate 	 * to take advantage of the less restricted page allocation
7000Sstevel@tonic-gate 	 * in page_create_throttle().
7010Sstevel@tonic-gate 	 */
7020Sstevel@tonic-gate 	kcage_cageout_init();
7030Sstevel@tonic-gate 
7040Sstevel@tonic-gate 	/*
7050Sstevel@tonic-gate 	 * Limit pushes to avoid saturating pageout devices.
7060Sstevel@tonic-gate 	 */
7070Sstevel@tonic-gate 	max_pushes = maxpgio / RATETOSCHEDPAGING;
7080Sstevel@tonic-gate 	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
7090Sstevel@tonic-gate 
7100Sstevel@tonic-gate 	for (;;) {
7110Sstevel@tonic-gate 		mutex_enter(&push_lock);
7120Sstevel@tonic-gate 
7130Sstevel@tonic-gate 		while ((arg = push_list) == NULL || pushes > max_pushes) {
7140Sstevel@tonic-gate 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
7150Sstevel@tonic-gate 			cv_wait(&push_cv, &push_lock);
7160Sstevel@tonic-gate 			pushes = 0;
7170Sstevel@tonic-gate 			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
7180Sstevel@tonic-gate 		}
7190Sstevel@tonic-gate 		push_list = arg->a_next;
7200Sstevel@tonic-gate 		arg->a_next = NULL;
7210Sstevel@tonic-gate 		mutex_exit(&push_lock);
7220Sstevel@tonic-gate 
7230Sstevel@tonic-gate 		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
7246118Sjimp 		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
7250Sstevel@tonic-gate 			pushes++;
7260Sstevel@tonic-gate 		}
7270Sstevel@tonic-gate 
7280Sstevel@tonic-gate 		/* vp held by checkpage() */
7290Sstevel@tonic-gate 		VN_RELE(arg->a_vp);
7300Sstevel@tonic-gate 
7310Sstevel@tonic-gate 		mutex_enter(&push_lock);
7320Sstevel@tonic-gate 		arg->a_next = req_freelist;	/* back on freelist */
7330Sstevel@tonic-gate 		req_freelist = arg;
7340Sstevel@tonic-gate 		push_list_size--;
7350Sstevel@tonic-gate 		mutex_exit(&push_lock);
7360Sstevel@tonic-gate 	}
7370Sstevel@tonic-gate }
7380Sstevel@tonic-gate 
7390Sstevel@tonic-gate /*
7400Sstevel@tonic-gate  * Kernel thread that scans pages looking for ones to free
7410Sstevel@tonic-gate  */
7420Sstevel@tonic-gate static void
pageout_scanner(void)7430Sstevel@tonic-gate pageout_scanner(void)
7440Sstevel@tonic-gate {
7450Sstevel@tonic-gate 	struct page *fronthand, *backhand;
7460Sstevel@tonic-gate 	uint_t count;
7470Sstevel@tonic-gate 	callb_cpr_t cprinfo;
7480Sstevel@tonic-gate 	pgcnt_t	nscan_limit;
7490Sstevel@tonic-gate 	pgcnt_t	pcount;
7500Sstevel@tonic-gate 
7510Sstevel@tonic-gate 	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
7520Sstevel@tonic-gate 	mutex_enter(&pageout_mutex);
7530Sstevel@tonic-gate 
7540Sstevel@tonic-gate 	/*
7550Sstevel@tonic-gate 	 * The restart case does not attempt to point the hands at roughly
7560Sstevel@tonic-gate 	 * the right point on the assumption that after one circuit things
7570Sstevel@tonic-gate 	 * will have settled down - and restarts shouldn't be that often.
7580Sstevel@tonic-gate 	 */
7590Sstevel@tonic-gate 
7600Sstevel@tonic-gate 	/*
7610Sstevel@tonic-gate 	 * Set the two clock hands to be separated by a reasonable amount,
7620Sstevel@tonic-gate 	 * but no more than 360 degrees apart.
7630Sstevel@tonic-gate 	 */
7640Sstevel@tonic-gate 	backhand = page_first();
7650Sstevel@tonic-gate 	if (handspreadpages >= total_pages)
7660Sstevel@tonic-gate 		fronthand = page_nextn(backhand, total_pages - 1);
7670Sstevel@tonic-gate 	else
7680Sstevel@tonic-gate 		fronthand = page_nextn(backhand, handspreadpages);
7690Sstevel@tonic-gate 
7700Sstevel@tonic-gate 	min_pageout_ticks = MAX(1,
7710Sstevel@tonic-gate 	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
7720Sstevel@tonic-gate 	max_pageout_ticks = MAX(min_pageout_ticks,
7730Sstevel@tonic-gate 	    ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
7740Sstevel@tonic-gate 
7750Sstevel@tonic-gate loop:
7760Sstevel@tonic-gate 	cv_signal_pageout();
7770Sstevel@tonic-gate 
7780Sstevel@tonic-gate 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
7790Sstevel@tonic-gate 	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
7800Sstevel@tonic-gate 	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
7810Sstevel@tonic-gate 
7820Sstevel@tonic-gate 	if (!dopageout)
7830Sstevel@tonic-gate 		goto loop;
7840Sstevel@tonic-gate 
7850Sstevel@tonic-gate 	if (reset_hands) {
7860Sstevel@tonic-gate 		reset_hands = 0;
7870Sstevel@tonic-gate 
7880Sstevel@tonic-gate 		backhand = page_first();
7890Sstevel@tonic-gate 		if (handspreadpages >= total_pages)
7900Sstevel@tonic-gate 			fronthand = page_nextn(backhand, total_pages - 1);
7910Sstevel@tonic-gate 		else
7920Sstevel@tonic-gate 			fronthand = page_nextn(backhand, handspreadpages);
7930Sstevel@tonic-gate 	}
7940Sstevel@tonic-gate 
7950Sstevel@tonic-gate 	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
7960Sstevel@tonic-gate 	count = 0;
7970Sstevel@tonic-gate 
7980Sstevel@tonic-gate 	TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
7996118Sjimp 	    "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
8006118Sjimp 	    freemem, lotsfree, nscan, desscan);
8010Sstevel@tonic-gate 
8020Sstevel@tonic-gate 	/* Kernel probe */
8030Sstevel@tonic-gate 	TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
8046118Sjimp 	    tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);
8050Sstevel@tonic-gate 
8060Sstevel@tonic-gate 	pcount = 0;
8070Sstevel@tonic-gate 	if (pageout_sample_cnt < pageout_sample_lim) {
8080Sstevel@tonic-gate 		nscan_limit = total_pages;
8090Sstevel@tonic-gate 	} else {
8100Sstevel@tonic-gate 		nscan_limit = desscan;
8110Sstevel@tonic-gate 	}
81211066Srafael.vanoni@sun.com 	pageout_lbolt = ddi_get_lbolt();
8130Sstevel@tonic-gate 	sample_start = gethrtime();
8140Sstevel@tonic-gate 
8150Sstevel@tonic-gate 	/*
8160Sstevel@tonic-gate 	 * Scan the appropriate number of pages for a single duty cycle.
8170Sstevel@tonic-gate 	 * However, stop scanning as soon as there is enough free memory.
8180Sstevel@tonic-gate 	 * For a short while, we will be sampling the performance of the
8190Sstevel@tonic-gate 	 * scanner and need to keep running just to get sample data, in
8200Sstevel@tonic-gate 	 * which case we keep going and don't pay attention to whether
8210Sstevel@tonic-gate 	 * or not there is enough free memory.
8220Sstevel@tonic-gate 	 */
8230Sstevel@tonic-gate 
8240Sstevel@tonic-gate 	while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
8250Sstevel@tonic-gate 	    pageout_sample_cnt < pageout_sample_lim)) {
8260Sstevel@tonic-gate 		int rvfront, rvback;
8270Sstevel@tonic-gate 
8280Sstevel@tonic-gate 		/*
8290Sstevel@tonic-gate 		 * Check to see if we have exceeded our %CPU budget
8300Sstevel@tonic-gate 		 * for this wakeup, but not on every single page visited,
8310Sstevel@tonic-gate 		 * just every once in a while.
8320Sstevel@tonic-gate 		 */
8330Sstevel@tonic-gate 		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
83411066Srafael.vanoni@sun.com 			pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
8350Sstevel@tonic-gate 			if (pageout_cycle_ticks >= pageout_ticks) {
8360Sstevel@tonic-gate 				++pageout_timeouts;
8370Sstevel@tonic-gate 				break;
8380Sstevel@tonic-gate 			}
8390Sstevel@tonic-gate 		}
8400Sstevel@tonic-gate 
8410Sstevel@tonic-gate 		/*
8420Sstevel@tonic-gate 		 * If checkpage manages to add a page to the free list,
8430Sstevel@tonic-gate 		 * we give ourselves another couple of trips around the loop.
8440Sstevel@tonic-gate 		 */
8450Sstevel@tonic-gate 		if ((rvfront = checkpage(fronthand, FRONT)) == 1)
8460Sstevel@tonic-gate 			count = 0;
8470Sstevel@tonic-gate 		if ((rvback = checkpage(backhand, BACK)) == 1)
8480Sstevel@tonic-gate 			count = 0;
8490Sstevel@tonic-gate 
8500Sstevel@tonic-gate 		++pcount;
8510Sstevel@tonic-gate 
8520Sstevel@tonic-gate 		/*
8530Sstevel@tonic-gate 		 * protected by pageout_mutex instead of cpu_stat_lock
8540Sstevel@tonic-gate 		 */
8550Sstevel@tonic-gate 		CPU_STATS_ADDQ(CPU, vm, scan, 1);
8560Sstevel@tonic-gate 
8570Sstevel@tonic-gate 		/*
8580Sstevel@tonic-gate 		 * Don't include ineligible pages in the number scanned.
8590Sstevel@tonic-gate 		 */
8600Sstevel@tonic-gate 		if (rvfront != -1 || rvback != -1)
8610Sstevel@tonic-gate 			nscan++;
8620Sstevel@tonic-gate 
8630Sstevel@tonic-gate 		backhand = page_next(backhand);
8640Sstevel@tonic-gate 
8650Sstevel@tonic-gate 		/*
8660Sstevel@tonic-gate 		 * backhand update and wraparound check are done separately
8670Sstevel@tonic-gate 		 * because lint barks when it finds an empty "if" body
8680Sstevel@tonic-gate 		 */
8690Sstevel@tonic-gate 
8700Sstevel@tonic-gate 		if ((fronthand = page_next(fronthand)) == page_first())	{
8710Sstevel@tonic-gate 			TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
8726118Sjimp 			    "pageout_hand_wrap:freemem %ld whichhand %d",
8736118Sjimp 			    freemem, FRONT);
8740Sstevel@tonic-gate 
8750Sstevel@tonic-gate 			/*
8760Sstevel@tonic-gate 			 * protected by pageout_mutex instead of cpu_stat_lock
8770Sstevel@tonic-gate 			 */
8780Sstevel@tonic-gate 			CPU_STATS_ADDQ(CPU, vm, rev, 1);
8790Sstevel@tonic-gate 			if (++count > 1) {
8800Sstevel@tonic-gate 				/*
8810Sstevel@tonic-gate 				 * Extremely unlikely, but it happens.
8820Sstevel@tonic-gate 				 * We went around the loop at least once
8830Sstevel@tonic-gate 				 * and didn't get far enough.
8840Sstevel@tonic-gate 				 * If we are still skipping `highly shared'
8850Sstevel@tonic-gate 				 * pages, skip fewer of them.  Otherwise,
8860Sstevel@tonic-gate 				 * give up till the next clock tick.
8870Sstevel@tonic-gate 				 */
8880Sstevel@tonic-gate 				if (po_share < MAX_PO_SHARE) {
8890Sstevel@tonic-gate 					po_share <<= 1;
8900Sstevel@tonic-gate 				} else {
8910Sstevel@tonic-gate 					/*
8920Sstevel@tonic-gate 					 * Really a "goto loop", but
8930Sstevel@tonic-gate 					 * if someone is TRACing or
8940Sstevel@tonic-gate 					 * TNF_PROBE_ing, at least
8950Sstevel@tonic-gate 					 * make records to show
8960Sstevel@tonic-gate 					 * where we are.
8970Sstevel@tonic-gate 					 */
8980Sstevel@tonic-gate 					break;
8990Sstevel@tonic-gate 				}
9000Sstevel@tonic-gate 			}
9010Sstevel@tonic-gate 		}
9020Sstevel@tonic-gate 	}
9030Sstevel@tonic-gate 
9040Sstevel@tonic-gate 	sample_end = gethrtime();
9050Sstevel@tonic-gate 
9060Sstevel@tonic-gate 	TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
9076118Sjimp 	    "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
9086118Sjimp 	    freemem, lotsfree, nscan, desscan, count);
9090Sstevel@tonic-gate 
9100Sstevel@tonic-gate 	/* Kernel probe */
9110Sstevel@tonic-gate 	TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
9126118Sjimp 	    tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);
9130Sstevel@tonic-gate 
9140Sstevel@tonic-gate 	if (pageout_sample_cnt < pageout_sample_lim) {
9150Sstevel@tonic-gate 		pageout_sample_pages += pcount;
9160Sstevel@tonic-gate 		pageout_sample_etime += sample_end - sample_start;
9170Sstevel@tonic-gate 		++pageout_sample_cnt;
9180Sstevel@tonic-gate 	}
9190Sstevel@tonic-gate 	if (pageout_sample_cnt >= pageout_sample_lim &&
9200Sstevel@tonic-gate 	    pageout_new_spread == 0) {
9210Sstevel@tonic-gate 		pageout_rate = (hrrate_t)pageout_sample_pages *
9220Sstevel@tonic-gate 		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
9230Sstevel@tonic-gate 		pageout_new_spread = pageout_rate / 10;
9240Sstevel@tonic-gate 		setupclock(1);
9250Sstevel@tonic-gate 	}
9260Sstevel@tonic-gate 
9270Sstevel@tonic-gate 	goto loop;
9280Sstevel@tonic-gate }
9290Sstevel@tonic-gate 
9300Sstevel@tonic-gate /*
9310Sstevel@tonic-gate  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
9320Sstevel@tonic-gate  * system (u., page table) or free, then leave it alone.  Otherwise,
9330Sstevel@tonic-gate  * if we are running the front hand, turn off the page's reference bit.
9340Sstevel@tonic-gate  * If the proc is over maxrss, we take it.  If running the back hand,
9350Sstevel@tonic-gate  * check whether the page has been reclaimed.  If not, free the page,
9360Sstevel@tonic-gate  * pushing it to disk first if necessary.
9370Sstevel@tonic-gate  *
9380Sstevel@tonic-gate  * Return values:
9390Sstevel@tonic-gate  *	-1 if the page is not a candidate at all,
9400Sstevel@tonic-gate  *	 0 if not freed, or
9410Sstevel@tonic-gate  *	 1 if we freed it.
9420Sstevel@tonic-gate  */
9430Sstevel@tonic-gate static int
checkpage(struct page * pp,int whichhand)9440Sstevel@tonic-gate checkpage(struct page *pp, int whichhand)
9450Sstevel@tonic-gate {
9460Sstevel@tonic-gate 	int ppattr;
9470Sstevel@tonic-gate 	int isfs = 0;
9480Sstevel@tonic-gate 	int isexec = 0;
9490Sstevel@tonic-gate 	int pagesync_flag;
9500Sstevel@tonic-gate 
9510Sstevel@tonic-gate 	/*
9520Sstevel@tonic-gate 	 * Skip pages:
9530Sstevel@tonic-gate 	 * 	- associated with the kernel vnode since
9540Sstevel@tonic-gate 	 *	    they are always "exclusively" locked.
9550Sstevel@tonic-gate 	 *	- that are free
9560Sstevel@tonic-gate 	 *	- that are shared more than po_share'd times
9570Sstevel@tonic-gate 	 *	- its already locked
9580Sstevel@tonic-gate 	 *
9590Sstevel@tonic-gate 	 * NOTE:  These optimizations assume that reads are atomic.
9600Sstevel@tonic-gate 	 */
9616695Saguzovsk 
9626695Saguzovsk 	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
9636695Saguzovsk 	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
9646695Saguzovsk 	    hat_page_checkshare(pp, po_share)) {
9650Sstevel@tonic-gate 		return (-1);
9660Sstevel@tonic-gate 	}
9670Sstevel@tonic-gate 
9680Sstevel@tonic-gate 	if (!page_trylock(pp, SE_EXCL)) {
9690Sstevel@tonic-gate 		/*
9700Sstevel@tonic-gate 		 * Skip the page if we can't acquire the "exclusive" lock.
9710Sstevel@tonic-gate 		 */
9720Sstevel@tonic-gate 		return (-1);
9730Sstevel@tonic-gate 	} else if (PP_ISFREE(pp)) {
9740Sstevel@tonic-gate 		/*
9750Sstevel@tonic-gate 		 * It became free between the above check and our actually
9760Sstevel@tonic-gate 		 * locking the page.  Oh, well there will be other pages.
9770Sstevel@tonic-gate 		 */
9780Sstevel@tonic-gate 		page_unlock(pp);
9790Sstevel@tonic-gate 		return (-1);
9800Sstevel@tonic-gate 	}
9810Sstevel@tonic-gate 
9820Sstevel@tonic-gate 	/*
9830Sstevel@tonic-gate 	 * Reject pages that cannot be freed. The page_struct_lock
9840Sstevel@tonic-gate 	 * need not be acquired to examine these
9850Sstevel@tonic-gate 	 * fields since the page has an "exclusive" lock.
9860Sstevel@tonic-gate 	 */
9870Sstevel@tonic-gate 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
9880Sstevel@tonic-gate 		page_unlock(pp);
9890Sstevel@tonic-gate 		return (-1);
9900Sstevel@tonic-gate 	}
9910Sstevel@tonic-gate 
9920Sstevel@tonic-gate 	/*
9930Sstevel@tonic-gate 	 * Maintain statistics for what we are freeing
9940Sstevel@tonic-gate 	 */
9950Sstevel@tonic-gate 
9960Sstevel@tonic-gate 	if (pp->p_vnode != NULL) {
9970Sstevel@tonic-gate 		if (pp->p_vnode->v_flag & VVMEXEC)
9980Sstevel@tonic-gate 			isexec = 1;
9990Sstevel@tonic-gate 
10000Sstevel@tonic-gate 		if (!IS_SWAPFSVP(pp->p_vnode))
10010Sstevel@tonic-gate 			isfs = 1;
10020Sstevel@tonic-gate 	}
10030Sstevel@tonic-gate 
10040Sstevel@tonic-gate 	/*
10050Sstevel@tonic-gate 	 * Turn off REF and MOD bits with the front hand.
10060Sstevel@tonic-gate 	 * The back hand examines the REF bit and always considers
10070Sstevel@tonic-gate 	 * SHARED pages as referenced.
10080Sstevel@tonic-gate 	 */
10090Sstevel@tonic-gate 	if (whichhand == FRONT)
10100Sstevel@tonic-gate 		pagesync_flag = HAT_SYNC_ZERORM;
10110Sstevel@tonic-gate 	else
10120Sstevel@tonic-gate 		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
10130Sstevel@tonic-gate 		    HAT_SYNC_STOPON_SHARED;
10140Sstevel@tonic-gate 
10150Sstevel@tonic-gate 	ppattr = hat_pagesync(pp, pagesync_flag);
10160Sstevel@tonic-gate 
10170Sstevel@tonic-gate recheck:
10180Sstevel@tonic-gate 	/*
10190Sstevel@tonic-gate 	 * If page is referenced; make unreferenced but reclaimable.
10200Sstevel@tonic-gate 	 * If this page is not referenced, then it must be reclaimable
10210Sstevel@tonic-gate 	 * and we can add it to the free list.
10220Sstevel@tonic-gate 	 */
10230Sstevel@tonic-gate 	if (ppattr & P_REF) {
10240Sstevel@tonic-gate 		TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
10250Sstevel@tonic-gate 		    "pageout_isref:pp %p whichhand %d", pp, whichhand);
10260Sstevel@tonic-gate 		if (whichhand == FRONT) {
10270Sstevel@tonic-gate 			/*
10280Sstevel@tonic-gate 			 * Checking of rss or madvise flags needed here...
10290Sstevel@tonic-gate 			 *
10300Sstevel@tonic-gate 			 * If not "well-behaved", fall through into the code
10310Sstevel@tonic-gate 			 * for not referenced.
10320Sstevel@tonic-gate 			 */
10330Sstevel@tonic-gate 			hat_clrref(pp);
10340Sstevel@tonic-gate 		}
10350Sstevel@tonic-gate 		/*
10360Sstevel@tonic-gate 		 * Somebody referenced the page since the front
10370Sstevel@tonic-gate 		 * hand went by, so it's not a candidate for
10380Sstevel@tonic-gate 		 * freeing up.
10390Sstevel@tonic-gate 		 */
10400Sstevel@tonic-gate 		page_unlock(pp);
10410Sstevel@tonic-gate 		return (0);
10420Sstevel@tonic-gate 	}
10430Sstevel@tonic-gate 
10440Sstevel@tonic-gate 	VM_STAT_ADD(pageoutvmstats.checkpage[0]);
10450Sstevel@tonic-gate 
10460Sstevel@tonic-gate 	/*
10470Sstevel@tonic-gate 	 * If large page, attempt to demote it. If successfully demoted,
10480Sstevel@tonic-gate 	 * retry the checkpage.
10490Sstevel@tonic-gate 	 */
10500Sstevel@tonic-gate 	if (pp->p_szc != 0) {
10510Sstevel@tonic-gate 		if (!page_try_demote_pages(pp)) {
10520Sstevel@tonic-gate 			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
10530Sstevel@tonic-gate 			page_unlock(pp);
10540Sstevel@tonic-gate 			return (-1);
10550Sstevel@tonic-gate 		}
10560Sstevel@tonic-gate 		ASSERT(pp->p_szc == 0);
10570Sstevel@tonic-gate 		VM_STAT_ADD(pageoutvmstats.checkpage[2]);
10580Sstevel@tonic-gate 		/*
10590Sstevel@tonic-gate 		 * since page_try_demote_pages() could have unloaded some
10600Sstevel@tonic-gate 		 * mappings it makes sense to reload ppattr.
10610Sstevel@tonic-gate 		 */
10620Sstevel@tonic-gate 		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
10630Sstevel@tonic-gate 	}
10640Sstevel@tonic-gate 
10650Sstevel@tonic-gate 	/*
10660Sstevel@tonic-gate 	 * If the page is currently dirty, we have to arrange
10670Sstevel@tonic-gate 	 * to have it cleaned before it can be freed.
10680Sstevel@tonic-gate 	 *
10690Sstevel@tonic-gate 	 * XXX - ASSERT(pp->p_vnode != NULL);
10700Sstevel@tonic-gate 	 */
10710Sstevel@tonic-gate 	if ((ppattr & P_MOD) && pp->p_vnode) {
10720Sstevel@tonic-gate 		struct vnode *vp = pp->p_vnode;
10730Sstevel@tonic-gate 		u_offset_t offset = pp->p_offset;
10740Sstevel@tonic-gate 
10750Sstevel@tonic-gate 		/*
10760Sstevel@tonic-gate 		 * XXX - Test for process being swapped out or about to exit?
10770Sstevel@tonic-gate 		 * [Can't get back to process(es) using the page.]
10780Sstevel@tonic-gate 		 */
10790Sstevel@tonic-gate 
10800Sstevel@tonic-gate 		/*
10810Sstevel@tonic-gate 		 * Hold the vnode before releasing the page lock to
10820Sstevel@tonic-gate 		 * prevent it from being freed and re-used by some
10830Sstevel@tonic-gate 		 * other thread.
10840Sstevel@tonic-gate 		 */
10850Sstevel@tonic-gate 		VN_HOLD(vp);
10860Sstevel@tonic-gate 		page_unlock(pp);
10870Sstevel@tonic-gate 
10880Sstevel@tonic-gate 		/*
10890Sstevel@tonic-gate 		 * Queue i/o request for the pageout thread.
10900Sstevel@tonic-gate 		 */
10910Sstevel@tonic-gate 		if (!queue_io_request(vp, offset)) {
10920Sstevel@tonic-gate 			VN_RELE(vp);
10930Sstevel@tonic-gate 			return (0);
10940Sstevel@tonic-gate 		}
10950Sstevel@tonic-gate 		return (1);
10960Sstevel@tonic-gate 	}
10970Sstevel@tonic-gate 
10980Sstevel@tonic-gate 	/*
10990Sstevel@tonic-gate 	 * Now we unload all the translations,
11000Sstevel@tonic-gate 	 * and put the page back on to the free list.
11010Sstevel@tonic-gate 	 * If the page was used (referenced or modified) after
11020Sstevel@tonic-gate 	 * the pagesync but before it was unloaded we catch it
11030Sstevel@tonic-gate 	 * and handle the page properly.
11040Sstevel@tonic-gate 	 */
11050Sstevel@tonic-gate 	TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
11066118Sjimp 	    "pageout_free:pp %p whichhand %d", pp, whichhand);
11070Sstevel@tonic-gate 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
11080Sstevel@tonic-gate 	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
11090Sstevel@tonic-gate 	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
11100Sstevel@tonic-gate 		goto recheck;
11110Sstevel@tonic-gate 
11120Sstevel@tonic-gate 	/*LINTED: constant in conditional context*/
11130Sstevel@tonic-gate 	VN_DISPOSE(pp, B_FREE, 0, kcred);
11140Sstevel@tonic-gate 
11150Sstevel@tonic-gate 	CPU_STATS_ADD_K(vm, dfree, 1);
11160Sstevel@tonic-gate 
11170Sstevel@tonic-gate 	if (isfs) {
11180Sstevel@tonic-gate 		if (isexec) {
11190Sstevel@tonic-gate 			CPU_STATS_ADD_K(vm, execfree, 1);
11200Sstevel@tonic-gate 		} else {
11210Sstevel@tonic-gate 			CPU_STATS_ADD_K(vm, fsfree, 1);
11220Sstevel@tonic-gate 		}
11230Sstevel@tonic-gate 	} else {
11240Sstevel@tonic-gate 		CPU_STATS_ADD_K(vm, anonfree, 1);
11250Sstevel@tonic-gate 	}
11260Sstevel@tonic-gate 
11270Sstevel@tonic-gate 	return (1);		/* freed a page! */
11280Sstevel@tonic-gate }
11290Sstevel@tonic-gate 
11300Sstevel@tonic-gate /*
11310Sstevel@tonic-gate  * Queue async i/o request from pageout_scanner and segment swapout
11320Sstevel@tonic-gate  * routines on one common list.  This ensures that pageout devices (swap)
11330Sstevel@tonic-gate  * are not saturated by pageout_scanner or swapout requests.
11340Sstevel@tonic-gate  * The pageout thread empties this list by initiating i/o operations.
11350Sstevel@tonic-gate  */
11360Sstevel@tonic-gate int
queue_io_request(vnode_t * vp,u_offset_t off)11370Sstevel@tonic-gate queue_io_request(vnode_t *vp, u_offset_t off)
11380Sstevel@tonic-gate {
11390Sstevel@tonic-gate 	struct async_reqs *arg;
11400Sstevel@tonic-gate 
11410Sstevel@tonic-gate 	/*
11420Sstevel@tonic-gate 	 * If we cannot allocate an async request struct,
11430Sstevel@tonic-gate 	 * skip this page.
11440Sstevel@tonic-gate 	 */
11450Sstevel@tonic-gate 	mutex_enter(&push_lock);
11460Sstevel@tonic-gate 	if ((arg = req_freelist) == NULL) {
11470Sstevel@tonic-gate 		mutex_exit(&push_lock);
11480Sstevel@tonic-gate 		return (0);
11490Sstevel@tonic-gate 	}
11500Sstevel@tonic-gate 	req_freelist = arg->a_next;		/* adjust freelist */
11510Sstevel@tonic-gate 	push_list_size++;
11520Sstevel@tonic-gate 
11530Sstevel@tonic-gate 	arg->a_vp = vp;
11540Sstevel@tonic-gate 	arg->a_off = off;
11550Sstevel@tonic-gate 	arg->a_len = PAGESIZE;
11560Sstevel@tonic-gate 	arg->a_flags = B_ASYNC | B_FREE;
11570Sstevel@tonic-gate 	arg->a_cred = kcred;		/* always held */
11580Sstevel@tonic-gate 
11590Sstevel@tonic-gate 	/*
11600Sstevel@tonic-gate 	 * Add to list of pending write requests.
11610Sstevel@tonic-gate 	 */
11620Sstevel@tonic-gate 	arg->a_next = push_list;
11630Sstevel@tonic-gate 	push_list = arg;
11640Sstevel@tonic-gate 
11650Sstevel@tonic-gate 	if (req_freelist == NULL) {
11660Sstevel@tonic-gate 		/*
11670Sstevel@tonic-gate 		 * No free async requests left. The lock is held so we
11680Sstevel@tonic-gate 		 * might as well signal the pusher thread now.
11690Sstevel@tonic-gate 		 */
11700Sstevel@tonic-gate 		cv_signal(&push_cv);
11710Sstevel@tonic-gate 	}
11720Sstevel@tonic-gate 	mutex_exit(&push_lock);
11730Sstevel@tonic-gate 	return (1);
11740Sstevel@tonic-gate }
11750Sstevel@tonic-gate 
11760Sstevel@tonic-gate /*
11770Sstevel@tonic-gate  * Wakeup pageout to initiate i/o if push_list is not empty.
11780Sstevel@tonic-gate  */
11790Sstevel@tonic-gate void
cv_signal_pageout()11800Sstevel@tonic-gate cv_signal_pageout()
11810Sstevel@tonic-gate {
11820Sstevel@tonic-gate 	if (push_list != NULL) {
11830Sstevel@tonic-gate 		mutex_enter(&push_lock);
11840Sstevel@tonic-gate 		cv_signal(&push_cv);
11850Sstevel@tonic-gate 		mutex_exit(&push_lock);
11860Sstevel@tonic-gate 	}
11870Sstevel@tonic-gate }
1188