/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/buf.h>
#include <sys/var.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/atomic.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/systm.h>
#include <sys/vfs.h>
#include <sys/sdt.h>

/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
#define	EMPTY_LIST	((struct buf *)-1)

static kcondvar_t	bio_mem_cv;	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */

/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",	KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",		KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",	KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",	KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",	KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",	KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
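
/*
 * For illustration (not part of the original source): bio_bhash()
 * spreads (dev, blkno) pairs across the v.v_hbuf hash buckets, so a
 * lookup for a cached block takes only one bucket lock.  A minimal
 * sketch of how a lookup finds its bucket, assuming hash2ints() mixes
 * the two integers as declared in <sys/bitmap.h>:
 *
 *	uint_t index = bio_bhash(dev, blkno);	-- bucket number
 *	struct hbuf *hp = &hbuf[index];		-- chain head
 *	kmutex_t *hmp = &hp->b_lock;		-- protects this chain
 *
 * getblk_common() and trygetblk() below follow this pattern.
 */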

uint_t	biostats_ndata = (uint_t)(sizeof (biostats) /
	sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};

/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 *        to ufs routines don't get plugged into bio.c calls so
 *        we initialize it when setting up the "lufsops" table
 *        in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem */
extern	int bufhwm_pct;		/* ditto - given in % of physmem */

/*
 * The following routines allocate and free
 * buffers with various side effects.
 * In general the arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to the buffer header;
 * the buffer returned is locked with a binary semaphore so that no
 * one else can touch it.  If the block was already in core, no I/O
 * need be done; if it is already locked, the process waits until it
 * becomes free.  The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/bdrwrite
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem, is used to gain exclusive access to
 * a buffer and a binary semaphore, b_io, is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed every time fsflush runs.  It
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */

/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	return (BREAD(dev, blkno, bsize));
}

/*
 * Common code for reading a buffer with various options
 *
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);

	CPU_STATS_ADD_K(sys, lread, 1);
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
	if (bp->b_flags & B_DONE)
		return (bp);
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	if (ufsvfsp == NULL) {					/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = lbolt;
		ub.ub_breads.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;
	CPU_STATS_ADD_K(sys, bread, 1);
	(void) biowait(bp);
	return (bp);
}
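
/*
 * For illustration (not in the original source): a minimal sketch of
 * the classic bread()/brelse() pattern a caller would use, assuming a
 * valid dev_t and a device-block-sized buffer.  The returned buffer is
 * locked (b_sem held) until it is released:
 *
 *	struct buf *bp = bread(dev, blkno, (long)DEV_BSIZE);
 *	if (geterror(bp) == 0) {
 *		... inspect bp->b_un.b_addr, bp->b_bcount bytes ...
 *	}
 *	brelse(bp);	-- unlock and return the buffer to the cache
 */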

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
	struct buf *bp, *rabp;
	klwp_t *lwp = ttolwp(curthread);

	bp = NULL;
	if (!bio_incore(dev, blkno)) {
		CPU_STATS_ADD_K(sys, lread, 1);
		bp = GETBLK(dev, blkno, bsize);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = bsize;
			(void) bdev_strategy(bp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (rablkno && bfreelist.b_bcount > 1 &&
	    !bio_incore(dev, rablkno)) {
		rabp = GETBLK(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = bsize;
			(void) bdev_strategy(rabp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (bp == NULL)
		return (BREAD(dev, blkno, bsize));
	(void) biowait(bp);
	return (bp);
}
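
/*
 * For illustration (not in the original source): a sequential reader
 * would typically hand breada() the next block as the read-ahead
 * candidate, so the synchronous wait for block N overlaps the device
 * I/O for block N + 1.  A hedged sketch, assuming fixed-size blocks:
 *
 *	for (blkno = 0; blkno < nblocks; blkno++) {
 *		bp = breada(dev, blkno, blkno + 1, bsize);
 *		... consume bp ...
 *		brelse(bp);
 *	}
 *
 * The read-ahead buffer stays in the cache, so the next iteration
 * usually finds it via getblk_common() without new device I/O.
 */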

/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
    int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}

/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}
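
/*
 * For illustration (not in the original source): a summary sketch of
 * how the write entry points in this file map onto bwrite_common(),
 * paraphrased from their header comments below:
 *
 *	bwrite(bp)  -> wait iff B_ASYNC is clear, then brelse()
 *	bwrite2(bp) -> always wait; the caller keeps bp locked
 *	bawrite(bp) -> sets B_ASYNC first, so the write is fire-and-forget
 */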

/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = lbolt;
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}
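
/*
 * For illustration (not in the original source): the delayed-write
 * pattern for a block that will likely be modified again shortly,
 * e.g. when appending to a partially filled block.  A hedged sketch:
 *
 *	bp = bread(dev, blkno, bsize);
 *	... modify part of bp->b_un.b_addr ...
 *	bdwrite(bp);	-- mark B_DELWRI|B_DONE and release; fsflush
 *			-- (or reuse pressure) writes it out later
 *
 * bawrite() suits the case where the block is complete and no further
 * write is expected soon; bwrite() the case where the data must be on
 * disk before the caller proceeds.
 */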

/*
 * Release the buffer, with no I/O implied.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct buf	*dp;
	struct hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist.  Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = lbolt;
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put it on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list is
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}

/*
 * Return a count of the number of B_BUSY buffers in the system.
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
	struct buf *bp, *dp;
	int busy = 0;
	int i;
	kmutex_t *hmp;

	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_flags & B_BUSY)
				busy++;
		}
		mutex_exit(hmp);
	}

	if (cleanit && busy != 0) {
		bflush(NODEV);
	}

	return (busy);
}

/*
 * This interface is provided for binary compatibility.
 *
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
	return (getblk_common(/* ufsvfsp */ NULL, dev,
	    blkno, bsize, /* errflg */ 0));
}
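
/*
 * For illustration (not in the original source): getblk() returns a
 * locked buffer without starting any I/O, so it suits write-only or
 * full-overwrite cases where reading the old contents would be wasted
 * work.  A hedged sketch for initializing a block from scratch:
 *
 *	bp = getblk(dev, blkno, bsize);
 *	clrbuf(bp);			-- zero the data area
 *	... fill in bp->b_un.b_addr ...
 *	bwrite(bp);			-- write synchronously and release
 */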

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk_common(void *arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;
	struct buf *nbp = NULL;
	struct buf *errbp;
	uint_t index;
	kmutex_t *hmp;
	struct hbuf *hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	mutex_enter(hmp);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone.  Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			mutex_exit(hmp);
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery.  So better bail out from
			 * here since sema_p() won't block.  If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly never kmem_free the buffer "header".
			 */
			sema_p(&bp->b_sem);
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				mutex_enter(hmp);
				goto loop;	/* start over */
			}
			mutex_enter(hmp);
		}
		/* Found */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		hp->b_length--;
		notavail(bp);
		mutex_exit(hmp);

		ASSERT((bp->b_flags & B_NOCACHE) == 0);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have entered during the lock upgrade
		 * so free the new buffer we allocated and return the
		 * found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		nbp->b_file = NULL;
		nbp->b_offset = -1;

		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer.  Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == 0);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);


	/*
	 * Come here in case of an internal error.  At this point we couldn't
	 * get a buffer, but we have to return one.  Hence we allocate some
	 * kind of error reply buffer on the fly.  This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause this buffer to be freed in
	 *	  brelse().
	 */

errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}

/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
	struct buf *bp;

	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
	bioinit(bp);
	bp->av_forw = bp->av_back = NULL;
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	bp->b_bufsize = bsize;
	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
	bp->b_dev = (o_dev_t)NODEV;
	bp->b_edev = NODEV;
	bp->b_lblkno = 0;
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	return (bp);
}
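
/*
 * For illustration (not in the original source): because ngeteblk()
 * buffers carry B_NOCACHE, releasing one destroys it rather than
 * caching it (see the B_NOCACHE case in brelse() above), which makes
 * them handy for one-shot scratch I/O.  A hedged sketch:
 *
 *	bp = ngeteblk(bsize);		-- locked, off all lists
 *	bp->b_edev = dev;
 *	bp->b_blkno = blkno;
 *	... fill bp->b_un.b_addr and write it, or read into it ...
 *	brelse(bp);			-- B_NOCACHE: freed, not cached
 */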

/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk((long)1024));
}

/*
 * Return a buffer w/o sleeping
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				sema_v(&bp->b_sem);
				break;
			}
		}
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	(void) biodone(bp);
}

/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}
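
/*
 * For illustration (not in the original source): trygetblk() is the
 * non-blocking variant of getblk(); it returns NULL instead of
 * sleeping on the hash lock or b_sem, and only hands back buffers
 * whose contents are valid (B_DONE).  A hedged sketch of an
 * opportunistic cache peek with a blocking fallback:
 *
 *	if ((bp = trygetblk(dev, blkno)) != NULL) {
 *		... fast path: use cached contents ...
 *		brelse(bp);
 *	} else {
 *		bp = bread(dev, blkno, bsize);	-- may sleep, may do I/O
 *		...
 *		brelse(bp);
 *	}
 */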

/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffers for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock.  So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		vfs_syncprogress();
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}

/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {				/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}

/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which may not have been flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}

/*
 * If possible, invalidate blocks for a dev on demand
 */
void
binval(dev_t dev)
{
	(void) bfinval(dev, 0);
}
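
/*
 * For illustration (not in the original source): a plausible sequence
 * when a device is going away, sketched under the assumption that the
 * caller decides whether a residual EIO is fatal.  bfinval() returns
 * EIO when a B_DELWRI buffer survived a non-forced pass:
 *
 *	bflush(dev);			-- push out delayed writes
 *	if (bfinval(dev, 0) == EIO)	-- invalidate what we can
 *		(void) bfinval(dev, 1);	-- force: drop the stragglers
 */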

/*
 * Initialize the buffer I/O system by freeing
 * all buffers and setting all device hash buffer lists to empty.
 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t	bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 *	- 1/4 of kernel virtual memory
	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	pct = BIO_BUF_PERCENT;
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
		    range(1..%d). Using %d as default.",
		    bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out \
		    of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets.  Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
	bp = &bfreelist;
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	for (i = 0; i < v.v_hbuf; i++) {
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}

/*
 * Wait for I/O completion on the buffer; return error code.
 * If bp was for synchronous I/O, bp is invalid and associated
 * resources are freed on return.
 */
int
biowait(struct buf *bp)
{
	int error = 0;
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));

	cpup = CPU;
	atomic_add_64(&cpup->cpu_stats.sys.iowait, 1);
	DTRACE_IO1(wait__start, struct buf *, bp);

	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr) {
		while ((bp->b_flags & B_DONE) == 0)
			drv_usecwait(10);
	} else
		sema_p(&bp->b_io);

	DTRACE_IO1(wait__done, struct buf *, bp);
	atomic_add_64(&cpup->cpu_stats.sys.iowait, -1);

	error = geterror(bp);
	if ((bp->b_flags & B_ASYNC) == 0) {
		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);
	}
	return (error);
}

static void
biodone_tnf_probe(struct buf *bp)
{
	/* Kernel probe */
	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
	    tnf_device, device, bp->b_edev,
	    tnf_diskaddr, block, bp->b_lblkno,
	    tnf_opaque, buf, bp);
}
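
/*
 * For illustration (not in the original source): the b_io semaphore is
 * what biowait() sleeps on and biodone() posts, so the synchronous I/O
 * handshake between a caller and a completion path looks roughly like
 * this sketch (not a complete driver):
 *
 *	caller				interrupt / completion
 *	------				----------------------
 *	bp->b_flags |= B_READ;
 *	(void) bdev_strategy(bp);
 *	error = biowait(bp);	<--	biodone(bp);
 *	  -- sleeps in sema_p(&bp->b_io)  -- sets B_DONE, then
 *					  -- sema_v(&bp->b_io) wakes us
 */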

static void
biodone_tnf_probe(struct buf *bp)
{
	/* Kernel probe */
	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
	    tnf_device, device, bp->b_edev,
	    tnf_diskaddr, block, bp->b_lblkno,
	    tnf_opaque, buf, bp);
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
biodone(struct buf *bp)
{
	if (bp->b_flags & B_STARTED) {
		DTRACE_IO1(done, struct buf *, bp);
		bp->b_flags &= ~B_STARTED;
	}

	/*
	 * Call the TNF probe here instead of the inline code
	 * to force our compiler to use the tail call optimization.
	 */
	biodone_tnf_probe(bp);

	if (bp->b_iodone != NULL) {
		(*(bp->b_iodone))(bp);
		return;
	}
	ASSERT((bp->b_flags & B_DONE) == 0);
	ASSERT(SEMA_HELD(&bp->b_sem));
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_ASYNC) {
		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
			bio_pageio_done(bp);
		else
			brelse(bp);	/* release bp to freelist */
	} else {
		sema_v(&bp->b_io);
	}
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
int
geterror(struct buf *bp)
{
	int error = 0;

	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
		if (!error)
			error = EIO;
	}
	return (error);
}
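
/*
 * Illustrative sketch (an assumption, not taken from this file): because
 * biodone() hands the buffer straight to b_iodone when it is set, a caller
 * that wants a private completion callback instead of waiting would do
 * something like:
 *
 *	bp->b_iodone = my_iodone;	(my_iodone is a hypothetical callback)
 *	bp->b_flags |= B_ASYNC;
 *	(void) bdev_strategy(bp);
 *
 * The callback then owns the buffer; none of the B_DONE/sema_v handling
 * above runs for it.
 */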

/*
 * Support for pageio buffers.
 *
 * This stuff should be generalized to provide a general bp
 * header facility that can be used for things other than pageio.
 */

/*
 * Allocate and initialize a buf struct for use with pageio.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
		if ((flags & B_ASYNC) == 0) {
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
			/* Kernel probe */
			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
			    tnf_opaque, vnode, pp->p_vnode,
			    tnf_offset, offset, pp->p_offset);
		}
		/*
		 * Update statistics for pages being paged in
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin,
				    btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
		/* Kernel probe */
		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
		    tnf_opaque, vnode, pp->p_vnode,
		    tnf_offset, offset, pp->p_offset,
		    tnf_size, size, len);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	VN_HOLD(vp);
	bp->b_vp = vp;
	THREAD_KPRI_RELEASE_N(btopr(len));	/* release kpri from page_locks */

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}
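
/*
 * Usage sketch (an assumption, not taken from this file), following the
 * comment at the end of pageio_setup() above:
 *
 *	bp = pageio_setup(pp, len, vp, B_READ);
 *	bp->b_edev = dev;		(caller-supplied device)
 *	bp->b_blkno = blkno;		(caller-supplied block number)
 *	(void) bdev_strategy(bp);
 *	error = biowait(bp);
 *	pageio_done(bp);
 *
 * bp_mapin(9F) may be used after setup if a kernel mapping is needed.
 */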

void
pageio_done(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);
	VN_RELE(bp->b_vp);
	bp->b_vp = NULL;
	ASSERT((bp->b_flags & B_NOCACHE) != 0);

	/* A sema_v(bp->b_sem) is implied if we are destroying it */
	sema_destroy(&bp->b_sem);
	sema_destroy(&bp->b_io);
	kmem_free(bp, sizeof (struct buf));
}

/*
 * Check whether any buffer associated with the device, other than the
 * one pointed to by sbp, is busy.
 * NOTE: This expensive operation should be improved together with
 * ufs_icheck().
 */
int
bcheck(dev_t dev, struct buf *sbp)
{
	struct buf *bp;
	struct buf *dp;
	int i;
	kmutex_t *hmp;

	/*
	 * check for busy bufs for this filesystem
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			/*
			 * if buf is busy or dirty, then filesystem is busy
			 */
			if ((bp->b_edev == dev) &&
			    ((bp->b_flags & B_STALE) == 0) &&
			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
			    (bp != sbp)) {
				mutex_exit(hmp);
				return (1);
			}
		}
		mutex_exit(hmp);
	}
	return (0);
}

/*
 * Hash two 32 bit entities.
 */
int
hash2ints(int x, int y)
{
	int hash = 0;

	hash = x - 1;
	hash = ((hash * 7) + (x >> 8)) - 1;
	hash = ((hash * 7) + (x >> 16)) - 1;
	hash = ((hash * 7) + (x >> 24)) - 1;
	hash = ((hash * 7) + y) - 1;
	hash = ((hash * 7) + (y >> 8)) - 1;
	hash = ((hash * 7) + (y >> 16)) - 1;
	hash = ((hash * 7) + (y >> 24)) - 1;

	return (hash);
}
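
/*
 * Equivalent formulation (illustrative only, not compiled): hash2ints()
 * folds the byte-shifted words of x and y into a base-7 polynomial,
 * subtracting one per step. Starting from hash = 0, the unrolled
 * statements above compute exactly:
 *
 *	int terms[8] = { x, x >> 8, x >> 16, x >> 24,
 *	    y, y >> 8, y >> 16, y >> 24 };
 *	int hash = 0, i;
 *	for (i = 0; i < 8; i++)
 *		hash = (hash * 7) + terms[i] - 1;
 *
 * bio_bhash() then masks the result with v.v_hmask to pick a bucket.
 */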

/*
 * Return a new buffer struct.
 *	Create a new buffer if we haven't gone over our high water
 *	mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t *hmp;
	uint_t start, end;

	/*
	 * bfreelist.b_bufsize represents the amount of memory
	 * we are allowed to allocate in the cache before we hit our hwm;
	 * references to bfreelist are protected by mutex_enter/mutex_exit
	 * on bfree_lock.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

again:
	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	} else {
		struct buf *save;

		save = bp;	/* Save bp we allocated */
		start = end = lastindex;

		biostats.bio_bufwant.value.ui32++;

		/*
		 * Memory isn't available from the system now. Scan
		 * the hash buckets until enough space is found.
		 */
		do {
			hp = &hbuf[start];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			mutex_enter(hmp);
			bp = dp->av_forw;

			while (bp != dp) {

				ASSERT(bp != NULL);

				if (!sema_tryp(&bp->b_sem)) {
					bp = bp->av_forw;
					continue;
				}

				/*
				 * Since we are going down the freelist
				 * associated with this hash bucket the
				 * B_DELWRI flag should not be set.
				 */
				ASSERT(!(bp->b_flags & B_DELWRI));

				if (bp->b_bufsize == bsize) {
					hp->b_length--;
					notavail(bp);
					bremhash(bp);
					mutex_exit(hmp);

					/*
					 * Didn't kmem_alloc any more, so don't
					 * count it twice.
					 */
					mutex_enter(&bfree_lock);
					bfreelist.b_bufsize += bsize;
					mutex_exit(&bfree_lock);

					/*
					 * Update the lastindex value.
					 */
					lastindex = start;

					/*
					 * Put our saved bp back on the list
					 */
					sema_v(&save->b_sem);
					bio_bhdr_free(save);
					ASSERT(SEMA_HELD(&bp->b_sem));
					return (bp);
				}
				sema_v(&bp->b_sem);
				bp = bp->av_forw;
			}
			mutex_exit(hmp);
			start = ((start + 1) % v.v_hbuf);
		} while (start != end);

		biostats.bio_bufwait.value.ui32++;
		bp = save;	/* Use original bp */
		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	}

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}
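
/*
 * Accounting note (illustrative restatement of the code above):
 * bio_mem_get() debits bfreelist.b_bufsize by bsize before any allocation
 * happens. If kmem_alloc() fails and a same-sized victim is stolen from a
 * hash-bucket freelist instead, the code credits bsize back, since no new
 * memory was actually consumed -- that victim's memory was charged when it
 * was first allocated, keeping the cache total within v.v_bufhwm kilobytes.
 */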

/*
 * Allocate a buffer header. If none currently available, allocate
 * a new pool.
 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	for (;;) {
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);
			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool. If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
		} else {
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
				dp->b_offset = -1;
			}
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}

static void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}

/*
 * If we haven't gone over the high water mark, it's o.k. to
 * allocate more buffer space, otherwise recycle buffers
 * from the freelist until enough memory is free for a bsize request.
 *
 * We account for this memory, even though
 * we don't allocate it here.
 */
static void
bio_mem_get(long bsize)
{
	mutex_enter(&bfree_lock);
	if (bfreelist.b_bufsize > bsize) {
		bfreelist.b_bufsize -= bsize;
		mutex_exit(&bfree_lock);
		return;
	}
	mutex_exit(&bfree_lock);
	bio_recycle(BIO_MEM, bsize);
}

/*
 * Flush a list of delayed write buffers.
 * (Currently used only by bio_recycle() below.)
 */
static void
bio_flushlist(struct buf *delwri_list)
{
	struct buf *bp;

	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;
		bp->b_flags |= B_AGE | B_ASYNC;
		if (bp->b_vp == NULL) {	/* !ufs */
			BWRITE(bp);
		} else {		/* ufs */
			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
}

/*
 * Start recycling buffers on the freelist for one of two reasons:
 *	- we need a buffer header
 *	- we need to free up memory
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int found = 0;
	kmutex_t *hmp;
	int start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * beginning.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return;	/* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_timedwait(&bio_mem_cv, &bfree_lock, lbolt + hz);
	mutex_exit(&bfree_lock);
	goto top;
}
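
/*
 * Note (illustrative summary of bio_recycle() above, not original text):
 * each hash bucket is processed in two phases -- first the clean
 * per-bucket freelist, whose headers and memory can be reclaimed
 * immediately, then that bucket's delayed write list, whose buffers are
 * gathered under blist_lock and pushed out through bio_flushlist() so
 * their memory can be reclaimed on a later pass once the writes complete.
 */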

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
static int
bio_incore(dev_t dev, daddr_t blkno)
{
	struct buf *bp;
	struct buf *dp;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	dp = (struct buf *)&hbuf[index];
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno == blkno && bp->b_edev == dev &&
		    (bp->b_flags & B_STALE) == 0) {
			mutex_exit(hmp);
			return (1);
		}
	}
	mutex_exit(hmp);
	return (0);
}

static void
bio_pageio_done(struct buf *bp)
{
	if (bp->b_flags & B_PAGEIO) {

		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);

		if (bp->b_flags & B_READ)
			pvn_read_done(bp->b_pages, bp->b_flags);
		else
			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
		pageio_done(bp);
	} else {
		ASSERT(bp->b_flags & B_REMAPPED);
		bp_mapout(bp);
		brelse(bp);
	}
}

/*
 * bioerror(9F) - indicate error in buffer header
 * If 'error' is zero, remove the error indication.
 */
void
bioerror(struct buf *bp, int error)
{
	ASSERT(bp != NULL);
	ASSERT(error >= 0);
	ASSERT(SEMA_HELD(&bp->b_sem));

	if (error != 0) {
		bp->b_flags |= B_ERROR;
	} else {
		bp->b_flags &= ~B_ERROR;
	}
	bp->b_error = error;
}

/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	biofini(bp);
	bioinit(bp);
}

/*
 * biosize(9F) - return size of a buffer header
 */
size_t
biosize(void)
{
	return (sizeof (struct buf));
}

/*
 * biomodified(9F) - check if buffer is modified
 */
int
biomodified(struct buf *bp)
{
	int npf;
	int ppattr;
	struct page *pp;

	ASSERT(bp != NULL);

	if ((bp->b_flags & B_PAGEIO) == 0) {
		return (-1);
	}
	pp = bp->b_pages;
	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));

	while (npf > 0) {
		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
		    HAT_SYNC_STOPON_MOD);
		if (ppattr & P_MOD)
			return (1);
		pp = pp->p_next;
		npf--;
	}

	return (0);
}

/*
 * bioinit(9F) - initialize a buffer structure
 */
void
bioinit(struct buf *bp)
{
	bzero(bp, sizeof (struct buf));
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	bp->b_offset = -1;
}

/*
 * biofini(9F) - uninitialize a buffer structure
 */
void
biofini(struct buf *bp)
{
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}
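
/*
 * Lifecycle sketch for the (9F) routines above (an assumption, not taken
 * from this file): a driver managing its own buffer header would do
 *
 *	struct buf *bp = kmem_alloc(biosize(), KM_SLEEP);
 *	bioinit(bp);
 *	... set up and issue I/O; bioreset(bp) between reuses ...
 *	biofini(bp);
 *	kmem_free(bp, biosize());
 *
 * so that the semaphores initialized by bioinit() are always destroyed.
 */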

/*
 * bioclone(9F) - clone a buffer
 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
    int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
	struct buf *bufp;

	ASSERT(bp);
	if (bp_mem == NULL) {
		bufp = kmem_alloc(sizeof (struct buf), sleep);
		if (bufp == NULL) {
			return (NULL);
		}
		bioinit(bufp);
	} else {
		bufp = bp_mem;
		bioreset(bufp);
	}

#define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
	B_ABRWRITE)

	/*
	 * The cloned buffer does not inherit the B_REMAPPED flag. A separate
	 * bp_mapin(9F) has to be done to get a kernel mapping.
	 */
	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
	bufp->b_bcount = len;
	bufp->b_blkno = blkno;
	bufp->b_iodone = iodone;
	bufp->b_proc = bp->b_proc;
	bufp->b_edev = dev;
	bufp->b_file = bp->b_file;
	bufp->b_offset = bp->b_offset;

	if (bp->b_flags & B_SHADOW) {
		ASSERT(bp->b_shadow);
		ASSERT(bp->b_flags & B_PHYS);

		bufp->b_shadow = bp->b_shadow +
		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
	} else {
		if (bp->b_flags & B_PAGEIO) {
			struct page *pp;
			off_t o;
			int i;

			pp = bp->b_pages;
			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
			for (i = btop(o); i > 0; i--) {
				pp = pp->p_next;
			}
			bufp->b_pages = pp;
			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
		} else {
			bufp->b_un.b_addr =
			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
			if (bp->b_flags & B_REMAPPED)
				bufp->b_proc = NULL;
		}
	}
	return (bufp);
}
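
/*
 * Usage sketch (an assumption, not taken from this file): a layered driver
 * splitting a request into a child I/O for an underlying device might do
 *
 *	cbp = bioclone(bp, off, chunk, child_dev, child_blkno,
 *	    my_child_done, NULL, KM_SLEEP);	(my_child_done is hypothetical)
 *	(void) bdev_strategy(cbp);
 *
 * with my_child_done() updating the parent request and eventually freeing
 * the clone -- for a header allocated by bioclone() itself, via biofini()
 * plus kmem_free(), mirroring the allocation path above.
 */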