10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 52999Sstans * Common Development and Distribution License (the "License"). 62999Sstans * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate */ 210Sstevel@tonic-gate /* 22*11888SPavel.Filipensky@Sun.COM * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 230Sstevel@tonic-gate * Use is subject to license terms. 
240Sstevel@tonic-gate */ 250Sstevel@tonic-gate 260Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 270Sstevel@tonic-gate /* All Rights Reserved */ 280Sstevel@tonic-gate 290Sstevel@tonic-gate /* 300Sstevel@tonic-gate * University Copyright- Copyright (c) 1982, 1986, 1988 310Sstevel@tonic-gate * The Regents of the University of California 320Sstevel@tonic-gate * All Rights Reserved 330Sstevel@tonic-gate * 340Sstevel@tonic-gate * University Acknowledgment- Portions of this document are derived from 350Sstevel@tonic-gate * software developed by the University of California, Berkeley, and its 360Sstevel@tonic-gate * contributors. 370Sstevel@tonic-gate */ 380Sstevel@tonic-gate 390Sstevel@tonic-gate /* 400Sstevel@tonic-gate * VM - paged vnode. 410Sstevel@tonic-gate * 420Sstevel@tonic-gate * This file supplies vm support for the vnode operations that deal with pages. 430Sstevel@tonic-gate */ 440Sstevel@tonic-gate #include <sys/types.h> 450Sstevel@tonic-gate #include <sys/t_lock.h> 460Sstevel@tonic-gate #include <sys/param.h> 470Sstevel@tonic-gate #include <sys/sysmacros.h> 480Sstevel@tonic-gate #include <sys/systm.h> 490Sstevel@tonic-gate #include <sys/time.h> 500Sstevel@tonic-gate #include <sys/buf.h> 510Sstevel@tonic-gate #include <sys/vnode.h> 520Sstevel@tonic-gate #include <sys/uio.h> 530Sstevel@tonic-gate #include <sys/vmmeter.h> 540Sstevel@tonic-gate #include <sys/vmsystm.h> 550Sstevel@tonic-gate #include <sys/mman.h> 560Sstevel@tonic-gate #include <sys/vfs.h> 570Sstevel@tonic-gate #include <sys/cred.h> 580Sstevel@tonic-gate #include <sys/user.h> 590Sstevel@tonic-gate #include <sys/kmem.h> 600Sstevel@tonic-gate #include <sys/cmn_err.h> 610Sstevel@tonic-gate #include <sys/debug.h> 620Sstevel@tonic-gate #include <sys/cpuvar.h> 630Sstevel@tonic-gate #include <sys/vtrace.h> 640Sstevel@tonic-gate #include <sys/tnf_probe.h> 650Sstevel@tonic-gate 660Sstevel@tonic-gate #include <vm/hat.h> 670Sstevel@tonic-gate #include <vm/as.h> 680Sstevel@tonic-gate 
#include <vm/seg.h>
#include <vm/rm.h>
#include <vm/pvn.h>
#include <vm/page.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <sys/fs/swapnode.h>

/*
 * Klustering tunables.  When non-zero, pvn_read_kluster() hands back only
 * the single page at the requested offset, and pvn_write_kluster() hands
 * back only the caller's page.
 */
int pvn_nofodklust = 0;
int pvn_write_noklust = 0;

/* set if HAT supports VMODSORT */
uint_t pvn_vmodsort_supported = 0;

/* set in /etc/system to disable HAT support for vmodsort for testing */
uint_t pvn_vmodsort_disable = 0;

/* kmem cache of marker page_t's; created by pvn_init() */
static struct kmem_cache *marker_cache = NULL;

/*
 * Find the largest contiguous block which contains `addr' for file offset
 * `offset' in it while living within the file system block sizes (`vp_off'
 * and `vp_len') and the address space limits for which no pages currently
 * exist and which map to consecutive file offsets.
900Sstevel@tonic-gate */ 910Sstevel@tonic-gate page_t * 920Sstevel@tonic-gate pvn_read_kluster( 930Sstevel@tonic-gate struct vnode *vp, 940Sstevel@tonic-gate u_offset_t off, 950Sstevel@tonic-gate struct seg *seg, 960Sstevel@tonic-gate caddr_t addr, 970Sstevel@tonic-gate u_offset_t *offp, /* return values */ 980Sstevel@tonic-gate size_t *lenp, /* return values */ 990Sstevel@tonic-gate u_offset_t vp_off, 1000Sstevel@tonic-gate size_t vp_len, 1010Sstevel@tonic-gate int isra) 1020Sstevel@tonic-gate { 1030Sstevel@tonic-gate ssize_t deltaf, deltab; 1040Sstevel@tonic-gate page_t *pp; 1050Sstevel@tonic-gate page_t *plist = NULL; 1060Sstevel@tonic-gate spgcnt_t pagesavail; 1070Sstevel@tonic-gate u_offset_t vp_end; 1080Sstevel@tonic-gate 1090Sstevel@tonic-gate ASSERT(off >= vp_off && off < vp_off + vp_len); 1100Sstevel@tonic-gate 1110Sstevel@tonic-gate /* 1120Sstevel@tonic-gate * We only want to do klustering/read ahead if there 1130Sstevel@tonic-gate * is more than minfree pages currently available. 1140Sstevel@tonic-gate */ 1150Sstevel@tonic-gate pagesavail = freemem - minfree; 1160Sstevel@tonic-gate 1170Sstevel@tonic-gate if (pagesavail <= 0) 1180Sstevel@tonic-gate if (isra) 1190Sstevel@tonic-gate return ((page_t *)NULL); /* ra case - give up */ 1200Sstevel@tonic-gate else 1210Sstevel@tonic-gate pagesavail = 1; /* must return a page */ 1220Sstevel@tonic-gate 1230Sstevel@tonic-gate /* We calculate in pages instead of bytes due to 32-bit overflows */ 1240Sstevel@tonic-gate if (pagesavail < (spgcnt_t)btopr(vp_len)) { 1250Sstevel@tonic-gate /* 1260Sstevel@tonic-gate * Don't have enough free memory for the 1270Sstevel@tonic-gate * max request, try sizing down vp request. 
1280Sstevel@tonic-gate */ 1290Sstevel@tonic-gate deltab = (ssize_t)(off - vp_off); 1300Sstevel@tonic-gate vp_len -= deltab; 1310Sstevel@tonic-gate vp_off += deltab; 1320Sstevel@tonic-gate if (pagesavail < btopr(vp_len)) { 1330Sstevel@tonic-gate /* 1340Sstevel@tonic-gate * Still not enough memory, just settle for 1350Sstevel@tonic-gate * pagesavail which is at least 1. 1360Sstevel@tonic-gate */ 1370Sstevel@tonic-gate vp_len = ptob(pagesavail); 1380Sstevel@tonic-gate } 1390Sstevel@tonic-gate } 1400Sstevel@tonic-gate 1410Sstevel@tonic-gate vp_end = vp_off + vp_len; 1420Sstevel@tonic-gate ASSERT(off >= vp_off && off < vp_end); 1430Sstevel@tonic-gate 1440Sstevel@tonic-gate if (isra && SEGOP_KLUSTER(seg, addr, 0)) 1450Sstevel@tonic-gate return ((page_t *)NULL); /* segment driver says no */ 1460Sstevel@tonic-gate 1470Sstevel@tonic-gate if ((plist = page_create_va(vp, off, 1480Sstevel@tonic-gate PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL) 1490Sstevel@tonic-gate return ((page_t *)NULL); 1500Sstevel@tonic-gate 1510Sstevel@tonic-gate if (vp_len <= PAGESIZE || pvn_nofodklust) { 1520Sstevel@tonic-gate *offp = off; 1530Sstevel@tonic-gate *lenp = MIN(vp_len, PAGESIZE); 1540Sstevel@tonic-gate } else { 1550Sstevel@tonic-gate /* 1560Sstevel@tonic-gate * Scan back from front by incrementing "deltab" and 1570Sstevel@tonic-gate * comparing "off" with "vp_off + deltab" to avoid 1580Sstevel@tonic-gate * "signed" versus "unsigned" conversion problems. 1590Sstevel@tonic-gate */ 1600Sstevel@tonic-gate for (deltab = PAGESIZE; off >= vp_off + deltab; 1610Sstevel@tonic-gate deltab += PAGESIZE) { 1620Sstevel@tonic-gate /* 1630Sstevel@tonic-gate * Call back to the segment driver to verify that 1640Sstevel@tonic-gate * the klustering/read ahead operation makes sense. 
1650Sstevel@tonic-gate */ 1660Sstevel@tonic-gate if (SEGOP_KLUSTER(seg, addr, -deltab)) 1670Sstevel@tonic-gate break; /* page not eligible */ 1680Sstevel@tonic-gate if ((pp = page_create_va(vp, off - deltab, 1690Sstevel@tonic-gate PAGESIZE, PG_EXCL, seg, addr - deltab)) 1700Sstevel@tonic-gate == NULL) 1710Sstevel@tonic-gate break; /* already have the page */ 1720Sstevel@tonic-gate /* 1730Sstevel@tonic-gate * Add page to front of page list. 1740Sstevel@tonic-gate */ 1750Sstevel@tonic-gate page_add(&plist, pp); 1760Sstevel@tonic-gate } 1770Sstevel@tonic-gate deltab -= PAGESIZE; 1780Sstevel@tonic-gate 1790Sstevel@tonic-gate /* scan forward from front */ 1800Sstevel@tonic-gate for (deltaf = PAGESIZE; off + deltaf < vp_end; 1810Sstevel@tonic-gate deltaf += PAGESIZE) { 1820Sstevel@tonic-gate /* 1830Sstevel@tonic-gate * Call back to the segment driver to verify that 1840Sstevel@tonic-gate * the klustering/read ahead operation makes sense. 1850Sstevel@tonic-gate */ 1860Sstevel@tonic-gate if (SEGOP_KLUSTER(seg, addr, deltaf)) 1870Sstevel@tonic-gate break; /* page not file extension */ 1880Sstevel@tonic-gate if ((pp = page_create_va(vp, off + deltaf, 1890Sstevel@tonic-gate PAGESIZE, PG_EXCL, seg, addr + deltaf)) 1900Sstevel@tonic-gate == NULL) 1910Sstevel@tonic-gate break; /* already have page */ 1920Sstevel@tonic-gate 1930Sstevel@tonic-gate /* 1940Sstevel@tonic-gate * Add page to end of page list. 1950Sstevel@tonic-gate */ 1960Sstevel@tonic-gate page_add(&plist, pp); 1970Sstevel@tonic-gate plist = plist->p_next; 1980Sstevel@tonic-gate } 1990Sstevel@tonic-gate *offp = off = off - deltab; 2000Sstevel@tonic-gate *lenp = deltab + deltaf; 2010Sstevel@tonic-gate ASSERT(off >= vp_off); 2020Sstevel@tonic-gate 2030Sstevel@tonic-gate /* 2040Sstevel@tonic-gate * If we ended up getting more than was actually 2050Sstevel@tonic-gate * requested, retract the returned length to only 2060Sstevel@tonic-gate * reflect what was requested. 
This might happen 2070Sstevel@tonic-gate * if we were allowed to kluster pages across a 2080Sstevel@tonic-gate * span of (say) 5 frags, and frag size is less 2090Sstevel@tonic-gate * than PAGESIZE. We need a whole number of 2100Sstevel@tonic-gate * pages to contain those frags, but the returned 2110Sstevel@tonic-gate * size should only allow the returned range to 2120Sstevel@tonic-gate * extend as far as the end of the frags. 2130Sstevel@tonic-gate */ 2140Sstevel@tonic-gate if ((vp_off + vp_len) < (off + *lenp)) { 2150Sstevel@tonic-gate ASSERT(vp_end > off); 2160Sstevel@tonic-gate *lenp = vp_end - off; 2170Sstevel@tonic-gate } 2180Sstevel@tonic-gate } 2190Sstevel@tonic-gate TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER, 220*11888SPavel.Filipensky@Sun.COM "pvn_read_kluster:seg %p addr %x isra %x", 221*11888SPavel.Filipensky@Sun.COM seg, addr, isra); 2220Sstevel@tonic-gate return (plist); 2230Sstevel@tonic-gate } 2240Sstevel@tonic-gate 2250Sstevel@tonic-gate /* 2260Sstevel@tonic-gate * Handle pages for this vnode on either side of the page "pp" 2270Sstevel@tonic-gate * which has been locked by the caller. This routine will also 2280Sstevel@tonic-gate * do klustering in the range [vp_off, vp_off + vp_len] up 2290Sstevel@tonic-gate * until a page which is not found. The offset and length 2300Sstevel@tonic-gate * of pages included is returned in "*offp" and "*lenp". 2310Sstevel@tonic-gate * 2320Sstevel@tonic-gate * Returns a list of dirty locked pages all ready to be 2330Sstevel@tonic-gate * written back. 
2340Sstevel@tonic-gate */ 2350Sstevel@tonic-gate page_t * 2360Sstevel@tonic-gate pvn_write_kluster( 2370Sstevel@tonic-gate struct vnode *vp, 2380Sstevel@tonic-gate page_t *pp, 2390Sstevel@tonic-gate u_offset_t *offp, /* return values */ 2400Sstevel@tonic-gate size_t *lenp, /* return values */ 2410Sstevel@tonic-gate u_offset_t vp_off, 2420Sstevel@tonic-gate size_t vp_len, 2430Sstevel@tonic-gate int flags) 2440Sstevel@tonic-gate { 2450Sstevel@tonic-gate u_offset_t off; 2460Sstevel@tonic-gate page_t *dirty; 2470Sstevel@tonic-gate size_t deltab, deltaf; 2480Sstevel@tonic-gate se_t se; 2490Sstevel@tonic-gate u_offset_t vp_end; 2500Sstevel@tonic-gate 2510Sstevel@tonic-gate off = pp->p_offset; 2520Sstevel@tonic-gate 2530Sstevel@tonic-gate /* 2540Sstevel@tonic-gate * Kustering should not be done if we are invalidating 2550Sstevel@tonic-gate * pages since we could destroy pages that belong to 2560Sstevel@tonic-gate * some other process if this is a swap vnode. 2570Sstevel@tonic-gate */ 2580Sstevel@tonic-gate if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) { 2590Sstevel@tonic-gate *offp = off; 2600Sstevel@tonic-gate *lenp = PAGESIZE; 2610Sstevel@tonic-gate return (pp); 2620Sstevel@tonic-gate } 2630Sstevel@tonic-gate 2640Sstevel@tonic-gate if (flags & (B_FREE | B_INVAL)) 2650Sstevel@tonic-gate se = SE_EXCL; 2660Sstevel@tonic-gate else 2670Sstevel@tonic-gate se = SE_SHARED; 2680Sstevel@tonic-gate 2690Sstevel@tonic-gate dirty = pp; 2700Sstevel@tonic-gate /* 2710Sstevel@tonic-gate * Scan backwards looking for pages to kluster by incrementing 2720Sstevel@tonic-gate * "deltab" and comparing "off" with "vp_off + deltab" to 2730Sstevel@tonic-gate * avoid "signed" versus "unsigned" conversion problems. 
2740Sstevel@tonic-gate */ 2750Sstevel@tonic-gate for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) { 2760Sstevel@tonic-gate pp = page_lookup_nowait(vp, off - deltab, se); 2770Sstevel@tonic-gate if (pp == NULL) 2780Sstevel@tonic-gate break; /* page not found */ 2790Sstevel@tonic-gate if (pvn_getdirty(pp, flags | B_DELWRI) == 0) 2800Sstevel@tonic-gate break; 2810Sstevel@tonic-gate page_add(&dirty, pp); 2820Sstevel@tonic-gate } 2830Sstevel@tonic-gate deltab -= PAGESIZE; 2840Sstevel@tonic-gate 2850Sstevel@tonic-gate vp_end = vp_off + vp_len; 2860Sstevel@tonic-gate /* now scan forwards looking for pages to kluster */ 2870Sstevel@tonic-gate for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) { 2880Sstevel@tonic-gate pp = page_lookup_nowait(vp, off + deltaf, se); 2890Sstevel@tonic-gate if (pp == NULL) 2900Sstevel@tonic-gate break; /* page not found */ 2910Sstevel@tonic-gate if (pvn_getdirty(pp, flags | B_DELWRI) == 0) 2920Sstevel@tonic-gate break; 2930Sstevel@tonic-gate page_add(&dirty, pp); 2940Sstevel@tonic-gate dirty = dirty->p_next; 2950Sstevel@tonic-gate } 2960Sstevel@tonic-gate 2970Sstevel@tonic-gate *offp = off - deltab; 2980Sstevel@tonic-gate *lenp = deltab + deltaf; 2990Sstevel@tonic-gate return (dirty); 3000Sstevel@tonic-gate } 3010Sstevel@tonic-gate 3020Sstevel@tonic-gate /* 3030Sstevel@tonic-gate * Generic entry point used to release the "shared/exclusive" lock 3040Sstevel@tonic-gate * and the "p_iolock" on pages after i/o is complete. 
3050Sstevel@tonic-gate */ 3060Sstevel@tonic-gate void 3070Sstevel@tonic-gate pvn_io_done(page_t *plist) 3080Sstevel@tonic-gate { 3090Sstevel@tonic-gate page_t *pp; 3100Sstevel@tonic-gate 3110Sstevel@tonic-gate while (plist != NULL) { 3120Sstevel@tonic-gate pp = plist; 3130Sstevel@tonic-gate page_sub(&plist, pp); 3140Sstevel@tonic-gate page_io_unlock(pp); 3150Sstevel@tonic-gate page_unlock(pp); 3160Sstevel@tonic-gate } 3170Sstevel@tonic-gate } 3180Sstevel@tonic-gate 3190Sstevel@tonic-gate /* 3200Sstevel@tonic-gate * Entry point to be used by file system getpage subr's and 3210Sstevel@tonic-gate * other such routines which either want to unlock pages (B_ASYNC 3220Sstevel@tonic-gate * request) or destroy a list of pages if an error occurred. 3230Sstevel@tonic-gate */ 3240Sstevel@tonic-gate void 3250Sstevel@tonic-gate pvn_read_done(page_t *plist, int flags) 3260Sstevel@tonic-gate { 3270Sstevel@tonic-gate page_t *pp; 3280Sstevel@tonic-gate 3290Sstevel@tonic-gate while (plist != NULL) { 3300Sstevel@tonic-gate pp = plist; 3310Sstevel@tonic-gate page_sub(&plist, pp); 3320Sstevel@tonic-gate page_io_unlock(pp); 3330Sstevel@tonic-gate if (flags & B_ERROR) { 3340Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 3350Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 3360Sstevel@tonic-gate } else { 3370Sstevel@tonic-gate (void) page_release(pp, 0); 3380Sstevel@tonic-gate } 3390Sstevel@tonic-gate } 3400Sstevel@tonic-gate } 3410Sstevel@tonic-gate 3420Sstevel@tonic-gate /* 3430Sstevel@tonic-gate * Automagic pageout. 3440Sstevel@tonic-gate * When memory gets tight, start freeing pages popping out of the 3450Sstevel@tonic-gate * write queue. 3460Sstevel@tonic-gate */ 3470Sstevel@tonic-gate int write_free = 1; 3480Sstevel@tonic-gate pgcnt_t pages_before_pager = 200; /* LMXXX */ 3490Sstevel@tonic-gate 3500Sstevel@tonic-gate /* 3510Sstevel@tonic-gate * Routine to be called when page-out's complete. 
3520Sstevel@tonic-gate * The caller, typically VOP_PUTPAGE, has to explicity call this routine 3530Sstevel@tonic-gate * after waiting for i/o to complete (biowait) to free the list of 3540Sstevel@tonic-gate * pages associated with the buffer. These pages must be locked 3550Sstevel@tonic-gate * before i/o is initiated. 3560Sstevel@tonic-gate * 3570Sstevel@tonic-gate * If a write error occurs, the pages are marked as modified 3580Sstevel@tonic-gate * so the write will be re-tried later. 3590Sstevel@tonic-gate */ 3600Sstevel@tonic-gate 3610Sstevel@tonic-gate void 3620Sstevel@tonic-gate pvn_write_done(page_t *plist, int flags) 3630Sstevel@tonic-gate { 3640Sstevel@tonic-gate int dfree = 0; 3650Sstevel@tonic-gate int pgrec = 0; 3660Sstevel@tonic-gate int pgout = 0; 3670Sstevel@tonic-gate int pgpgout = 0; 3680Sstevel@tonic-gate int anonpgout = 0; 3690Sstevel@tonic-gate int anonfree = 0; 3700Sstevel@tonic-gate int fspgout = 0; 3710Sstevel@tonic-gate int fsfree = 0; 3720Sstevel@tonic-gate int execpgout = 0; 3730Sstevel@tonic-gate int execfree = 0; 3740Sstevel@tonic-gate page_t *pp; 3750Sstevel@tonic-gate struct cpu *cpup; 3760Sstevel@tonic-gate struct vnode *vp = NULL; /* for probe */ 3770Sstevel@tonic-gate uint_t ppattr; 3782999Sstans kmutex_t *vphm = NULL; 3790Sstevel@tonic-gate 3800Sstevel@tonic-gate ASSERT((flags & B_READ) == 0); 3810Sstevel@tonic-gate 3820Sstevel@tonic-gate /* 3830Sstevel@tonic-gate * If we are about to start paging anyway, start freeing pages. 3840Sstevel@tonic-gate */ 3850Sstevel@tonic-gate if (write_free && freemem < lotsfree + pages_before_pager && 3860Sstevel@tonic-gate (flags & B_ERROR) == 0) { 3870Sstevel@tonic-gate flags |= B_FREE; 3880Sstevel@tonic-gate } 3890Sstevel@tonic-gate 3900Sstevel@tonic-gate /* 3910Sstevel@tonic-gate * Handle each page involved in the i/o operation. 
3920Sstevel@tonic-gate */ 3930Sstevel@tonic-gate while (plist != NULL) { 3940Sstevel@tonic-gate pp = plist; 3950Sstevel@tonic-gate ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp)); 3960Sstevel@tonic-gate page_sub(&plist, pp); 3970Sstevel@tonic-gate 3980Sstevel@tonic-gate /* Kernel probe support */ 3990Sstevel@tonic-gate if (vp == NULL) 4000Sstevel@tonic-gate vp = pp->p_vnode; 4010Sstevel@tonic-gate 4024324Sqiao if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) { 4032999Sstans /* 4042999Sstans * Move page to the top of the v_page list. 4052999Sstans * Skip pages modified during IO. 4062999Sstans */ 4072999Sstans vphm = page_vnode_mutex(vp); 4082999Sstans mutex_enter(vphm); 4092999Sstans if ((pp->p_vpnext != pp) && !hat_ismod(pp)) { 4102999Sstans page_vpsub(&vp->v_pages, pp); 4112999Sstans page_vpadd(&vp->v_pages, pp); 4122999Sstans } 4132999Sstans mutex_exit(vphm); 4142999Sstans } 4152999Sstans 4160Sstevel@tonic-gate if (flags & B_ERROR) { 4170Sstevel@tonic-gate /* 4180Sstevel@tonic-gate * Write operation failed. We don't want 4190Sstevel@tonic-gate * to destroy (or free) the page unless B_FORCE 4200Sstevel@tonic-gate * is set. We set the mod bit again and release 4210Sstevel@tonic-gate * all locks on the page so that it will get written 4220Sstevel@tonic-gate * back again later when things are hopefully 4230Sstevel@tonic-gate * better again. 4240Sstevel@tonic-gate * If B_INVAL and B_FORCE is set we really have 4250Sstevel@tonic-gate * to destroy the page. 
4260Sstevel@tonic-gate */ 4270Sstevel@tonic-gate if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) { 4280Sstevel@tonic-gate page_io_unlock(pp); 4290Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 4300Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 4310Sstevel@tonic-gate } else { 4324324Sqiao hat_setmod_only(pp); 4330Sstevel@tonic-gate page_io_unlock(pp); 4340Sstevel@tonic-gate page_unlock(pp); 4350Sstevel@tonic-gate } 4360Sstevel@tonic-gate } else if (flags & B_INVAL) { 4370Sstevel@tonic-gate /* 4380Sstevel@tonic-gate * XXX - Failed writes with B_INVAL set are 4390Sstevel@tonic-gate * not handled appropriately. 4400Sstevel@tonic-gate */ 4410Sstevel@tonic-gate page_io_unlock(pp); 4420Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 4430Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 4440Sstevel@tonic-gate } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) { 4450Sstevel@tonic-gate /* 4460Sstevel@tonic-gate * Update statistics for pages being paged out 4470Sstevel@tonic-gate */ 4480Sstevel@tonic-gate if (pp->p_vnode) { 4490Sstevel@tonic-gate if (IS_SWAPFSVP(pp->p_vnode)) { 4500Sstevel@tonic-gate anonpgout++; 4510Sstevel@tonic-gate } else { 4520Sstevel@tonic-gate if (pp->p_vnode->v_flag & VVMEXEC) { 4530Sstevel@tonic-gate execpgout++; 4540Sstevel@tonic-gate } else { 4550Sstevel@tonic-gate fspgout++; 4560Sstevel@tonic-gate } 4570Sstevel@tonic-gate } 4580Sstevel@tonic-gate } 4590Sstevel@tonic-gate page_io_unlock(pp); 4600Sstevel@tonic-gate pgout = 1; 4610Sstevel@tonic-gate pgpgout++; 4620Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT, 463*11888SPavel.Filipensky@Sun.COM "page_ws_out:pp %p", pp); 4640Sstevel@tonic-gate 4650Sstevel@tonic-gate /* 4660Sstevel@tonic-gate * The page_struct_lock need not be acquired to 4670Sstevel@tonic-gate * examine "p_lckcnt" and "p_cowcnt" since we'll 4680Sstevel@tonic-gate * have an "exclusive" lock if the upgrade succeeds. 
4690Sstevel@tonic-gate */ 4700Sstevel@tonic-gate if (page_tryupgrade(pp) && 4710Sstevel@tonic-gate pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4720Sstevel@tonic-gate /* 4730Sstevel@tonic-gate * Check if someone has reclaimed the 4740Sstevel@tonic-gate * page. If ref and mod are not set, no 4750Sstevel@tonic-gate * one is using it so we can free it. 4760Sstevel@tonic-gate * The rest of the system is careful 4770Sstevel@tonic-gate * to use the NOSYNC flag to unload 4780Sstevel@tonic-gate * translations set up for i/o w/o 4790Sstevel@tonic-gate * affecting ref and mod bits. 4800Sstevel@tonic-gate * 4810Sstevel@tonic-gate * Obtain a copy of the real hardware 4820Sstevel@tonic-gate * mod bit using hat_pagesync(pp, HAT_DONTZERO) 4830Sstevel@tonic-gate * to avoid having to flush the cache. 4840Sstevel@tonic-gate */ 4850Sstevel@tonic-gate ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | 486*11888SPavel.Filipensky@Sun.COM HAT_SYNC_STOPON_MOD); 4870Sstevel@tonic-gate ck_refmod: 4880Sstevel@tonic-gate if (!(ppattr & (P_REF | P_MOD))) { 4890Sstevel@tonic-gate if (hat_page_is_mapped(pp)) { 4900Sstevel@tonic-gate /* 4910Sstevel@tonic-gate * Doesn't look like the page 4920Sstevel@tonic-gate * was modified so now we 4930Sstevel@tonic-gate * really have to unload the 4940Sstevel@tonic-gate * translations. Meanwhile 4950Sstevel@tonic-gate * another CPU could've 4960Sstevel@tonic-gate * modified it so we have to 4970Sstevel@tonic-gate * check again. We don't loop 4980Sstevel@tonic-gate * forever here because now 4990Sstevel@tonic-gate * the translations are gone 5000Sstevel@tonic-gate * and no one can get a new one 5010Sstevel@tonic-gate * since we have the "exclusive" 5020Sstevel@tonic-gate * lock on the page. 
5030Sstevel@tonic-gate */ 5040Sstevel@tonic-gate (void) hat_pageunload(pp, 505*11888SPavel.Filipensky@Sun.COM HAT_FORCE_PGUNLOAD); 5060Sstevel@tonic-gate ppattr = hat_page_getattr(pp, 507*11888SPavel.Filipensky@Sun.COM P_REF | P_MOD); 5080Sstevel@tonic-gate goto ck_refmod; 5090Sstevel@tonic-gate } 5100Sstevel@tonic-gate /* 5110Sstevel@tonic-gate * Update statistics for pages being 5120Sstevel@tonic-gate * freed 5130Sstevel@tonic-gate */ 5140Sstevel@tonic-gate if (pp->p_vnode) { 5150Sstevel@tonic-gate if (IS_SWAPFSVP(pp->p_vnode)) { 5160Sstevel@tonic-gate anonfree++; 5170Sstevel@tonic-gate } else { 5180Sstevel@tonic-gate if (pp->p_vnode->v_flag 5190Sstevel@tonic-gate & VVMEXEC) { 5200Sstevel@tonic-gate execfree++; 5210Sstevel@tonic-gate } else { 5220Sstevel@tonic-gate fsfree++; 5230Sstevel@tonic-gate } 5240Sstevel@tonic-gate } 5250Sstevel@tonic-gate } 5260Sstevel@tonic-gate /*LINTED: constant in conditional ctx*/ 5270Sstevel@tonic-gate VN_DISPOSE(pp, B_FREE, 528*11888SPavel.Filipensky@Sun.COM (flags & B_DONTNEED), kcred); 5290Sstevel@tonic-gate dfree++; 5300Sstevel@tonic-gate } else { 5310Sstevel@tonic-gate page_unlock(pp); 5320Sstevel@tonic-gate pgrec++; 5330Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE, 5340Sstevel@tonic-gate "page_ws_free:pp %p", pp); 5350Sstevel@tonic-gate } 5360Sstevel@tonic-gate } else { 5370Sstevel@tonic-gate /* 5380Sstevel@tonic-gate * Page is either `locked' in memory 5390Sstevel@tonic-gate * or was reclaimed and now has a 5400Sstevel@tonic-gate * "shared" lock, so release it. 5410Sstevel@tonic-gate */ 5420Sstevel@tonic-gate page_unlock(pp); 5430Sstevel@tonic-gate } 5440Sstevel@tonic-gate } else { 5450Sstevel@tonic-gate /* 5460Sstevel@tonic-gate * Neither B_FREE nor B_INVAL nor B_ERROR. 5470Sstevel@tonic-gate * Just release locks. 
5480Sstevel@tonic-gate */ 5490Sstevel@tonic-gate page_io_unlock(pp); 5500Sstevel@tonic-gate page_unlock(pp); 5510Sstevel@tonic-gate } 5520Sstevel@tonic-gate } 5530Sstevel@tonic-gate 5540Sstevel@tonic-gate CPU_STATS_ENTER_K(); 5550Sstevel@tonic-gate cpup = CPU; /* get cpup now that CPU cannot change */ 5560Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, dfree, dfree); 5570Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec); 5580Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, pgout, pgout); 5590Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout); 5600Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout); 5610Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree); 5620Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout); 5630Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree); 5640Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout); 5650Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, execfree, execfree); 5660Sstevel@tonic-gate CPU_STATS_EXIT_K(); 5670Sstevel@tonic-gate 5680Sstevel@tonic-gate /* Kernel probe */ 5690Sstevel@tonic-gate TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */, 570*11888SPavel.Filipensky@Sun.COM tnf_opaque, vnode, vp, 571*11888SPavel.Filipensky@Sun.COM tnf_ulong, pages_pageout, pgpgout, 572*11888SPavel.Filipensky@Sun.COM tnf_ulong, pages_freed, dfree, 573*11888SPavel.Filipensky@Sun.COM tnf_ulong, pages_reclaimed, pgrec); 5740Sstevel@tonic-gate } 5750Sstevel@tonic-gate 5760Sstevel@tonic-gate /* 5770Sstevel@tonic-gate * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, 5780Sstevel@tonic-gate * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster 5790Sstevel@tonic-gate * operation and is only to be considered if it doesn't involve any 5800Sstevel@tonic-gate * waiting here. B_TRUNC indicates that the file is being truncated 5810Sstevel@tonic-gate * and so no i/o needs to be done. 
B_FORCE indicates that the page 5820Sstevel@tonic-gate * must be destroyed so don't try wrting it out. 5830Sstevel@tonic-gate * 5840Sstevel@tonic-gate * The caller must ensure that the page is locked. Returns 1, if 5850Sstevel@tonic-gate * the page should be written back (the "iolock" is held in this 5860Sstevel@tonic-gate * case), or 0 if the page has been dealt with or has been 5870Sstevel@tonic-gate * unlocked. 5880Sstevel@tonic-gate */ 5890Sstevel@tonic-gate int 5900Sstevel@tonic-gate pvn_getdirty(page_t *pp, int flags) 5910Sstevel@tonic-gate { 5920Sstevel@tonic-gate ASSERT((flags & (B_INVAL | B_FREE)) ? 5930Sstevel@tonic-gate PAGE_EXCL(pp) : PAGE_SHARED(pp)); 5940Sstevel@tonic-gate ASSERT(PP_ISFREE(pp) == 0); 5950Sstevel@tonic-gate 5960Sstevel@tonic-gate /* 5970Sstevel@tonic-gate * If trying to invalidate or free a logically `locked' page, 5980Sstevel@tonic-gate * forget it. Don't need page_struct_lock to check p_lckcnt and 5990Sstevel@tonic-gate * p_cowcnt as the page is exclusively locked. 6000Sstevel@tonic-gate */ 6010Sstevel@tonic-gate if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) && 6020Sstevel@tonic-gate (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) { 6030Sstevel@tonic-gate page_unlock(pp); 6040Sstevel@tonic-gate return (0); 6050Sstevel@tonic-gate } 6060Sstevel@tonic-gate 6070Sstevel@tonic-gate /* 6080Sstevel@tonic-gate * Now acquire the i/o lock so we can add it to the dirty 6090Sstevel@tonic-gate * list (if necessary). We avoid blocking on the i/o lock 6100Sstevel@tonic-gate * in the following cases: 6110Sstevel@tonic-gate * 6120Sstevel@tonic-gate * If B_DELWRI is set, which implies that this request is 6130Sstevel@tonic-gate * due to a klustering operartion. 6140Sstevel@tonic-gate * 6150Sstevel@tonic-gate * If this is an async (B_ASYNC) operation and we are not doing 6160Sstevel@tonic-gate * invalidation (B_INVAL) [The current i/o or fsflush will ensure 6170Sstevel@tonic-gate * that the the page is written out]. 
6180Sstevel@tonic-gate */ 6190Sstevel@tonic-gate if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) { 6200Sstevel@tonic-gate if (!page_io_trylock(pp)) { 6210Sstevel@tonic-gate page_unlock(pp); 6220Sstevel@tonic-gate return (0); 6230Sstevel@tonic-gate } 6240Sstevel@tonic-gate } else { 6250Sstevel@tonic-gate page_io_lock(pp); 6260Sstevel@tonic-gate } 6270Sstevel@tonic-gate 6280Sstevel@tonic-gate /* 6290Sstevel@tonic-gate * If we want to free or invalidate the page then 6300Sstevel@tonic-gate * we need to unload it so that anyone who wants 6310Sstevel@tonic-gate * it will have to take a minor fault to get it. 6320Sstevel@tonic-gate * Otherwise, we're just writing the page back so we 6330Sstevel@tonic-gate * need to sync up the hardwre and software mod bit to 6340Sstevel@tonic-gate * detect any future modifications. We clear the 6350Sstevel@tonic-gate * software mod bit when we put the page on the dirty 6360Sstevel@tonic-gate * list. 6370Sstevel@tonic-gate */ 6380Sstevel@tonic-gate if (flags & (B_INVAL | B_FREE)) { 6390Sstevel@tonic-gate (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 6400Sstevel@tonic-gate } else { 6410Sstevel@tonic-gate (void) hat_pagesync(pp, HAT_SYNC_ZERORM); 6420Sstevel@tonic-gate } 6430Sstevel@tonic-gate 6440Sstevel@tonic-gate if (!hat_ismod(pp) || (flags & B_TRUNC)) { 6450Sstevel@tonic-gate /* 6460Sstevel@tonic-gate * Don't need to add it to the 6470Sstevel@tonic-gate * list after all. 
6480Sstevel@tonic-gate */ 6490Sstevel@tonic-gate page_io_unlock(pp); 6500Sstevel@tonic-gate if (flags & B_INVAL) { 6510Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 6520Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 6530Sstevel@tonic-gate } else if (flags & B_FREE) { 6540Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 6550Sstevel@tonic-gate VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred); 6560Sstevel@tonic-gate } else { 6570Sstevel@tonic-gate /* 6580Sstevel@tonic-gate * This is advisory path for the callers 6590Sstevel@tonic-gate * of VOP_PUTPAGE() who prefer freeing the 6600Sstevel@tonic-gate * page _only_ if no one else is accessing it. 6610Sstevel@tonic-gate * E.g. segmap_release() 6620Sstevel@tonic-gate * 6630Sstevel@tonic-gate * The above hat_ismod() check is useless because: 6640Sstevel@tonic-gate * (1) we may not be holding SE_EXCL lock; 6650Sstevel@tonic-gate * (2) we've not unloaded _all_ translations 6660Sstevel@tonic-gate * 6670Sstevel@tonic-gate * Let page_release() do the heavy-lifting. 6680Sstevel@tonic-gate */ 6690Sstevel@tonic-gate (void) page_release(pp, 1); 6700Sstevel@tonic-gate } 6710Sstevel@tonic-gate return (0); 6720Sstevel@tonic-gate } 6730Sstevel@tonic-gate 6740Sstevel@tonic-gate /* 6750Sstevel@tonic-gate * Page is dirty, get it ready for the write back 6760Sstevel@tonic-gate * and add page to the dirty list. 6770Sstevel@tonic-gate */ 6780Sstevel@tonic-gate hat_clrrefmod(pp); 6790Sstevel@tonic-gate 6800Sstevel@tonic-gate /* 6810Sstevel@tonic-gate * If we're going to free the page when we're done 6820Sstevel@tonic-gate * then we can let others try to use it starting now. 6830Sstevel@tonic-gate * We'll detect the fact that they used it when the 6840Sstevel@tonic-gate * i/o is done and avoid freeing the page. 
6850Sstevel@tonic-gate */ 6860Sstevel@tonic-gate if (flags & B_FREE) 6870Sstevel@tonic-gate page_downgrade(pp); 6880Sstevel@tonic-gate 6890Sstevel@tonic-gate 6900Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp); 6910Sstevel@tonic-gate 6920Sstevel@tonic-gate return (1); 6930Sstevel@tonic-gate }


/*
 * Constructor handed to kmem_cache_create() for marker_cache.
 *
 * Zeroes the page_t and tags p_hash with PVN_VPLIST_HASH_TAG so that code
 * walking a vnode's page list (see pvn_vplist_setdirty()) can tell a marker
 * apart from a real page.  Always returns 0.
 */
/*ARGSUSED*/
static int
marker_constructor(void *buf, void *cdrarg, int kmflags)
{
	page_t *mark = buf;

	bzero(mark, sizeof (page_t));
	mark->p_hash = PVN_VPLIST_HASH_TAG;
	return (0);
}

/*
 * One-time initialization for the pvn layer: record whether the HAT
 * supports VMODSORT (unless pvn_vmodsort_disable is set) and create the
 * kmem cache from which pvn_vplist_dirty() allocates its marker pages.
 */
void
pvn_init(void)
{
	if (pvn_vmodsort_disable == 0)
		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
	marker_cache = kmem_cache_create("marker_cache",
	    sizeof (page_t), 0, marker_constructor,
	    NULL, NULL, NULL, NULL, 0);
}


/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list. The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 *    Due to race conditions when the vphm mutex is dropped, additional pages
 *    can be added to either end of the list, so we'll continue to move
 *    the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition. If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 *
 * Returns 0 if the vnode has no pages, EAGAIN if B_ASYNC is set and another
 * thread already holds VVMLOCK on the vnode, and otherwise the first
 * non-zero value returned by (*putapage)() (0 if every call succeeded).
 */
int
pvn_vplist_dirty(
	vnode_t		*vp,
	u_offset_t	off,
	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
			size_t *, int, cred_t *),
	int		flags,
	cred_t		*cred)
{
	page_t		*pp;
	page_t		*mark;		/* marker page that moves toward head */
	page_t		*end;		/* marker page at end of list */
	int		err = 0;
	int		error;
	kmutex_t	*vphm;
	se_t		se;
	page_t		**where_to_move;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return (0);


	/*
	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
	 *
	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
	 * from getting blocked while flushing pages to a dead NFS server.
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
		mutex_exit(&vp->v_lock);
		return (EAGAIN);
	}

	while (vp->v_flag & VVMLOCK)
		cv_wait(&vp->v_cv, &vp->v_lock);

	if (vp->v_pages == NULL) {
		mutex_exit(&vp->v_lock);
		return (0);
	}

	vp->v_flag |= VVMLOCK;
	mutex_exit(&vp->v_lock);


	/*
	 * Set up the marker pages used to walk the list
	 */
	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
	end->p_vnode = vp;
	end->p_offset = (u_offset_t)-2;
	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
	mark->p_vnode = vp;
	mark->p_offset = (u_offset_t)-1;

	/*
	 * Grab the lock protecting the vnode's page list
	 * note that this lock is dropped at times in the loop.
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if (vp->v_pages == NULL)
		goto leave;

	/*
	 * insert the markers and loop through the list of pages
	 */
	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
	page_vpadd(&mark->p_vpnext, end);
	for (;;) {

		/*
		 * If only doing an async write back, then we can
		 * stop as soon as we get to start of the list.
		 */
		if (flags == B_ASYNC && vp->v_pages == mark)
			break;

		/*
		 * otherwise stop when we've gone through all the pages
		 */
		if (mark->p_vpprev == end)
			break;

		/*
		 * Remember where the marker has to be re-inserted so that
		 * it ends up immediately in front of pp once pp is consumed.
		 */
		pp = mark->p_vpprev;
		if (vp->v_pages == pp)
			where_to_move = &vp->v_pages;
		else
			where_to_move = &pp->p_vpprev->p_vpnext;

		ASSERT(pp->p_vnode == vp);

		/*
		 * If just flushing dirty pages to disk and this vnode
		 * is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page. Since all the
		 * modified pages are visited first.
		 */
		if (IS_VMODSORT(vp) &&
		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
			if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef  DEBUG
				/*
				 * For debug kernels examine what should be
				 * all the remaining clean pages, asserting
				 * that they are not modified.
				 */
				page_t	*chk = pp;
				int	attr;

				page_vpsub(&vp->v_pages, mark);
				page_vpadd(where_to_move, mark);
				do {
					chk = chk->p_vpprev;
					ASSERT(chk != end);
					if (chk == mark)
						continue;
					attr = hat_page_getattr(chk, P_MOD |
					    P_REF);
					if ((attr & P_MOD) == 0)
						continue;
					panic("v_pages list not all clean: "
					    "page_t*=%p vnode=%p off=%lx "
					    "attr=0x%x last clean page_t*=%p\n",
					    (void *)chk, (void *)chk->p_vnode,
					    (long)chk->p_offset, attr,
					    (void *)pp);
				} while (chk != vp->v_pages);
#endif
				break;
			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
				/*
				 * Couldn't get io lock, wait until IO is done.
				 * Block only for sync IO since we don't want
				 * to block async IO.
				 */
				mutex_exit(vphm);
				page_io_wait(pp);
				mutex_enter(vphm);
				continue;
			}
		}

		/*
		 * Skip this page if the offset is out of the desired range.
		 * Just move the marker and continue.
		 */
		if (pp->p_offset < off) {
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
			continue;
		}

		/*
		 * If we are supposed to invalidate or free this
		 * page, then we need an exclusive lock.
		 */
		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		/*
		 * We must acquire the page lock for all synchronous
		 * operations (invalidate, free and write).
		 */
		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
			/*
			 * If the page_lock() drops the mutex
			 * we must retry the loop.
			 */
			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
				continue;

			/*
			 * It's ok to move the marker page now.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
		} else {

			/*
			 * update the marker page for all remaining cases
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);

			/*
			 * For write backs, If we can't lock the page, it's
			 * invalid or in the process of being destroyed.  Skip
			 * it, assuming someone else is writing it.
			 */
			if (!page_trylock(pp, se))
				continue;
		}

		ASSERT(pp->p_vnode == vp);

		/*
		 * Successfully locked the page, now figure out what to
		 * do with it. Free pages are easily dealt with, invalidate
		 * if desired or just go on to the next page.
		 */
		if (PP_ISFREE(pp)) {
			if ((flags & B_INVAL) == 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Invalidate (destroy) the page.
			 */
			mutex_exit(vphm);
			page_destroy_free(pp);
			mutex_enter(vphm);
			continue;
		}

		/*
		 * pvn_getdirty() figures out what to do with a dirty page.
		 * If the page is dirty, the putapage() routine will write it
		 * and will kluster any other adjacent dirty pages it can.
		 *
		 * pvn_getdirty() and `(*putapage)' unlock the page.
		 */
		mutex_exit(vphm);
		if (pvn_getdirty(pp, flags)) {
			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			/* only the first failure is reported to the caller */
			if (!err)
				err = error;
		}
		mutex_enter(vphm);
	}
	page_vpsub(&vp->v_pages, mark);
	page_vpsub(&vp->v_pages, end);

leave:
	/*
	 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
	 */
	mutex_exit(vphm);
	kmem_cache_free(marker_cache, mark);
	kmem_cache_free(marker_cache, end);
	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VVMLOCK;
	cv_broadcast(&vp->v_cv);
	mutex_exit(&vp->v_lock);
	return (err);
}

/*
 * Walk the vp->v_pages list, for every page call the callback function
 * pointed by *page_check. If page_check returns non-zero, then mark the
 * page as modified and if VMODSORT is set, move it to the end of v_pages
 * list. Moving makes sense only if we have at least two pages - this also
 * avoids having v_pages temporarily being NULL after calling page_vpsub()
 * if there was just one page.
 *
 * Entries whose p_hash equals PVN_VPLIST_HASH_TAG are skipped without
 * calling page_check.  The whole walk is done while holding the
 * page_vnode_mutex() of vp.
 */
void
pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
{
	page_t		*pp, *next, *end;
	kmutex_t	*vphm;
	int		shuffle;

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	if (vp->v_pages == NULL) {
		mutex_exit(vphm);
		return;
	}

	/*
	 * Remember the original last entry: pages moved to the tail during
	 * the walk land after it and are therefore not visited again.
	 */
	end = vp->v_pages->p_vpprev;
	shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
	pp = vp->v_pages;

	for (;;) {
		/* fetch the successor before pp is possibly moved */
		next = pp->p_vpnext;
		if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
			/*
			 * hat_setmod_only() in contrast to hat_setmod() does
			 * not shuffle the pages and does not grab the mutex
			 * page_vnode_mutex. Exactly what we need.
			 */
			hat_setmod_only(pp);
			if (shuffle) {
				page_vpsub(&vp->v_pages, pp);
				ASSERT(vp->v_pages != NULL);
				page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
				    pp);
			}
		}
		/* Stop if we have just processed the last page. */
		if (pp == end)
			break;
		pp = next;
	}

	mutex_exit(vphm);
}

/*
 * Zero out zbytes worth of data. Caller should be aware that this
 * routine may enter back into the fs layer (xxx_getpage). Locks
 * that the xxx_getpage routine may need should not be held while
 * calling this.
 *
 * vplen is the offset within vp at which zeroing starts.  Panics if
 * zbytes plus the offset of vplen within its MAXBSIZE window exceeds
 * MAXBSIZE.
 */
void
pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
{
	caddr_t addr;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return;

	/*
	 * zbytes may be zero but there still may be some portion of
	 * a page which needs clearing (since zbytes is a function
	 * of filesystem block size, not pagesize.)
	 *
	 * NOTE(review): if PAGEOFFSET is PAGESIZE - 1, the difference below
	 * is always in [1, PAGESIZE] and this early return can never fire.
	 * Left as-is because callers may rely on the current behavior;
	 * confirm the intent before changing it.
	 */
	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
		return;

	/*
	 * We get the last page and handle the partial
	 * zeroing via kernel mappings.  This will make the page
	 * dirty so that we know that when this page is written
	 * back, the zeroed information will go out with it.  If
	 * the page is not currently in memory, then the kzero
	 * operation will cause it to be brought in.  We use kzero
	 * instead of bzero so that if the page cannot be read in
	 * for any reason, the system will not panic.  We need
	 * to zero out a minimum of the fs given zbytes, but we
	 * might also have to do more to get the entire last page.
	 */

	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
		panic("pvn_vptrunc zbytes");
	addr = segmap_getmapflt(segkmap, vp, vplen,
	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
	(void) kzero(addr + (vplen & MAXBOFFSET),
	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
}

/*
 * Handles common work of the VOP_GETPAGE routines when more than
 * one page must be returned by calling a file system specific operation
 * to do most of the work.  Must be called with the vp already locked
 * by the VOP_GETPAGE routine.
 *
 * (*getpage)() is invoked once per PAGESIZE step over [off, off + len).
 * Returns 0 if every call succeeds (including the case where len is 0 and
 * no call is made), otherwise the first non-zero value returned by
 * (*getpage)().  On failure, pages already stored in pl[] are released and
 * their slots cleared.
 */
int
pvn_getpages(
	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
	    size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cred)
{
	page_t **ppp;
	u_offset_t o, eoff;
	size_t sz, xlen;
	int err = 0;	/* stays 0 if the loop below never runs (len == 0) */

	ASSERT(plsz >= len);		/* ensure that we have enough space */

	/*
	 * Loop one page at a time and let getapage function fill
	 * in the next page in array.  We only allow one page to be
	 * returned at a time (except for the last page) so that we
	 * don't have any problems with duplicates and other such
	 * painful problems.  This is a very simple minded algorithm,
	 * but it does the job correctly.  We hope that the cost of a
	 * getapage call for a resident page that we might have been
	 * able to get from an earlier call doesn't cost too much.
	 */
	ppp = pl;
	sz = PAGESIZE;
	eoff = off + len;
	xlen = len;
	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
	    xlen -= PAGESIZE) {
		if (o + PAGESIZE >= eoff) {
			/*
			 * Last time through - allow all of
			 * what's left of the pl[] array to be used.
			 */
			sz = plsz - (o - off);
		}
		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
		    rw, cred);
		if (err) {
			/*
			 * Release any pages we already got.
			 */
			if (o > off && pl != NULL) {
				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
					(void) page_release(*ppp, 1);
			}
			break;
		}
		if (pl != NULL)
			ppp++;
	}
	return (err);
}

/*
 * Initialize the page list array.
 *
 * pp is a list of i/o-locked pages.  Pages at the front of the list whose
 * offset is not `off' are dropped; then up to plsz bytes worth of pages
 * are moved into pl[], which is NULL terminated (so pl[] needs room for
 * the terminator); any pages left over are dropped as well.  io_len is
 * unused.
 */
/*ARGSUSED*/
void
pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
    u_offset_t off, size_t io_len, enum seg_rw rw)
{
	ssize_t sz;
	page_t *ppcur, **ppp;

	/*
	 * Set up to load plsz worth
	 * starting at the needed page.
	 */
	while (pp != NULL && pp->p_offset != off) {
		/*
		 * Remove page from the i/o list,
		 * release the i/o and the page lock.
		 */
		ppcur = pp;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		(void) page_release(ppcur, 1);
	}

	/* the needed page was not on the list: hand back an empty array */
	if (pp == NULL) {
		pl[0] = NULL;
		return;
	}

	/* signed so that the countdown below can go negative and stop */
	sz = plsz;

	/*
	 * Initialize the page list array.
	 */
	ppp = pl;
	do {
		ppcur = pp;
		*ppp++ = ppcur;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		if (rw != S_CREATE)
			page_downgrade(ppcur);
		sz -= PAGESIZE;
	} while (sz > 0 && pp != NULL);
	*ppp = NULL;		/* terminate list */

	/*
	 * Now free the remaining pages that weren't
	 * loaded in the page list.
	 */
	while (pp != NULL) {
		ppcur = pp;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		(void) page_release(ppcur, 1);
	}
}