xref: /onnv-gate/usr/src/uts/common/os/bio.c (revision 0:68f95e015346)
1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate  * CDDL HEADER START
3*0Sstevel@tonic-gate  *
4*0Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*0Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*0Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*0Sstevel@tonic-gate  * with the License.
8*0Sstevel@tonic-gate  *
9*0Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*0Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*0Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*0Sstevel@tonic-gate  * and limitations under the License.
13*0Sstevel@tonic-gate  *
14*0Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*0Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*0Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*0Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*0Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*0Sstevel@tonic-gate  *
20*0Sstevel@tonic-gate  * CDDL HEADER END
21*0Sstevel@tonic-gate  */
22*0Sstevel@tonic-gate /*
23*0Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*0Sstevel@tonic-gate  * Use is subject to license terms.
25*0Sstevel@tonic-gate  */
26*0Sstevel@tonic-gate 
27*0Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28*0Sstevel@tonic-gate /*	  All Rights Reserved  	*/
29*0Sstevel@tonic-gate 
30*0Sstevel@tonic-gate /*
31*0Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
32*0Sstevel@tonic-gate  * The Regents of the University of California
33*0Sstevel@tonic-gate  * All Rights Reserved
34*0Sstevel@tonic-gate  *
35*0Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
36*0Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
37*0Sstevel@tonic-gate  * contributors.
38*0Sstevel@tonic-gate  */
39*0Sstevel@tonic-gate 
40*0Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
41*0Sstevel@tonic-gate 
42*0Sstevel@tonic-gate #include <sys/types.h>
43*0Sstevel@tonic-gate #include <sys/t_lock.h>
44*0Sstevel@tonic-gate #include <sys/sysmacros.h>
45*0Sstevel@tonic-gate #include <sys/conf.h>
46*0Sstevel@tonic-gate #include <sys/cpuvar.h>
47*0Sstevel@tonic-gate #include <sys/errno.h>
48*0Sstevel@tonic-gate #include <sys/debug.h>
49*0Sstevel@tonic-gate #include <sys/buf.h>
50*0Sstevel@tonic-gate #include <sys/var.h>
51*0Sstevel@tonic-gate #include <sys/vnode.h>
52*0Sstevel@tonic-gate #include <sys/bitmap.h>
53*0Sstevel@tonic-gate #include <sys/cmn_err.h>
54*0Sstevel@tonic-gate #include <sys/kmem.h>
55*0Sstevel@tonic-gate #include <sys/vmem.h>
56*0Sstevel@tonic-gate #include <sys/atomic.h>
57*0Sstevel@tonic-gate #include <vm/seg_kmem.h>
58*0Sstevel@tonic-gate #include <vm/page.h>
59*0Sstevel@tonic-gate #include <vm/pvn.h>
60*0Sstevel@tonic-gate #include <sys/vtrace.h>
61*0Sstevel@tonic-gate #include <sys/tnf_probe.h>
62*0Sstevel@tonic-gate #include <sys/fs/ufs_inode.h>
63*0Sstevel@tonic-gate #include <sys/fs/ufs_bio.h>
64*0Sstevel@tonic-gate #include <sys/fs/ufs_log.h>
65*0Sstevel@tonic-gate #include <sys/systm.h>
66*0Sstevel@tonic-gate #include <sys/vfs.h>
67*0Sstevel@tonic-gate #include <sys/sdt.h>
68*0Sstevel@tonic-gate 
/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int 	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

/* Hash a (device, block number) pair into an hbuf[] bucket index. */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
/* Sentinel value marking a buffer list with no entries. */
#define	EMPTY_LIST	((struct buf *)-1)

static kcondvar_t	bio_mem_cv; 	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
90*0Sstevel@tonic-gate 
/*
 * Statistics on the buffer cache, exported as named kstats.
 * Field order must match struct biostats; each entry is a
 * (name, KSTAT_DATA_UINT32) pair.
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data: expose the biostats structure as a flat array of
 * kstat_named_t entries plus its element count.
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));
109*0Sstevel@tonic-gate 
/*
 * Statistics on ufs buffer cache.
 * Not protected by locks; counters are best-effort
 * (updated with plain increments, e.g. in bread_common/bwrite_common).
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};
124*0Sstevel@tonic-gate 
/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 *        to ufs routines don't get plugged into bio.c calls so
 *        we initialize it when setting up the "lufsops" table
 *        in "lufs.c:_init()"
 *
 * Both pointers remain NULL until the lufs module plugs them in;
 * callers below always test for NULL before dispatching through them.
 */
void (*bio_lufs_strategy)(void *, buf_t *);	/* ufs logging strategy */
void (*bio_snapshot_strategy)(void *, buf_t *);	/* ufs snapshot strategy */


/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void 		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void 		bio_pageio_done(struct buf *);
static int 		bio_incore(dev_t, daddr_t);
146*0Sstevel@tonic-gate 
/*
 * Buffer cache constants.
 *
 * Note the percent macros expand to divisors: memory / (100/N)
 * yields N% of memory.
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() (presumably: recycle headers vs. memory) */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem  */
extern	int bufhwm_pct;		/* ditto - given in % of physmem  */
164*0Sstevel@tonic-gate 
165*0Sstevel@tonic-gate /*
166*0Sstevel@tonic-gate  * The following routines allocate and free
167*0Sstevel@tonic-gate  * buffers with various side effects.  In general the
168*0Sstevel@tonic-gate  * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer
 * to the buffer header; the buffer returned is locked with a
171*0Sstevel@tonic-gate  * binary semaphore so that no one else can touch it. If the block was
172*0Sstevel@tonic-gate  * already in core, no I/O need be done; if it is
173*0Sstevel@tonic-gate  * already locked, the process waits until it becomes free.
174*0Sstevel@tonic-gate  * The following routines allocate a buffer:
175*0Sstevel@tonic-gate  *	getblk
176*0Sstevel@tonic-gate  *	bread/BREAD
177*0Sstevel@tonic-gate  *	breada
178*0Sstevel@tonic-gate  * Eventually the buffer must be released, possibly with the
179*0Sstevel@tonic-gate  * side effect of writing it out, by using one of
180*0Sstevel@tonic-gate  *	bwrite/BWRITE/brwrite
181*0Sstevel@tonic-gate  *	bdwrite/bdrwrite
182*0Sstevel@tonic-gate  *	bawrite
183*0Sstevel@tonic-gate  *	brelse
184*0Sstevel@tonic-gate  *
185*0Sstevel@tonic-gate  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
186*0Sstevel@tonic-gate  * Instead, a binary semaphore, b_sem is used to gain exclusive access to
187*0Sstevel@tonic-gate  * a buffer and a binary semaphore, b_io is used for I/O synchronization.
188*0Sstevel@tonic-gate  * B_DONE is still used to denote a buffer with I/O complete on it.
189*0Sstevel@tonic-gate  *
 * The bfreelist.b_bcount field is computed every time fsflush runs.  It
 * should not be used where a very accurate count of the free buffers is
192*0Sstevel@tonic-gate  * needed.
193*0Sstevel@tonic-gate  */
194*0Sstevel@tonic-gate 
195*0Sstevel@tonic-gate /*
196*0Sstevel@tonic-gate  * Read in (if necessary) the block and return a buffer pointer.
197*0Sstevel@tonic-gate  *
198*0Sstevel@tonic-gate  * This interface is provided for binary compatibility.  Using
199*0Sstevel@tonic-gate  * BREAD() directly avoids the extra function call overhead invoked
200*0Sstevel@tonic-gate  * by calling this routine.
201*0Sstevel@tonic-gate  */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	/* Thin binary-compatibility wrapper around the BREAD() macro. */
	return (BREAD(dev, blkno, bsize));
}
207*0Sstevel@tonic-gate 
208*0Sstevel@tonic-gate /*
209*0Sstevel@tonic-gate  * Common code for reading a buffer with various options
210*0Sstevel@tonic-gate  *
211*0Sstevel@tonic-gate  * Read in (if necessary) the block and return a buffer pointer.
212*0Sstevel@tonic-gate  */
213*0Sstevel@tonic-gate struct buf *
214*0Sstevel@tonic-gate bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
215*0Sstevel@tonic-gate {
216*0Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
217*0Sstevel@tonic-gate 	struct buf *bp;
218*0Sstevel@tonic-gate 	klwp_t *lwp = ttolwp(curthread);
219*0Sstevel@tonic-gate 
220*0Sstevel@tonic-gate 	CPU_STATS_ADD_K(sys, lread, 1);
221*0Sstevel@tonic-gate 	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
222*0Sstevel@tonic-gate 	if (bp->b_flags & B_DONE)
223*0Sstevel@tonic-gate 		return (bp);
224*0Sstevel@tonic-gate 	bp->b_flags |= B_READ;
225*0Sstevel@tonic-gate 	ASSERT(bp->b_bcount == bsize);
226*0Sstevel@tonic-gate 	if (ufsvfsp == NULL) {					/* !ufs */
227*0Sstevel@tonic-gate 		(void) bdev_strategy(bp);
228*0Sstevel@tonic-gate 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
229*0Sstevel@tonic-gate 							/* ufs && logging */
230*0Sstevel@tonic-gate 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
231*0Sstevel@tonic-gate 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
232*0Sstevel@tonic-gate 							/* ufs && snapshots */
233*0Sstevel@tonic-gate 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
234*0Sstevel@tonic-gate 	} else {
235*0Sstevel@tonic-gate 		ufsvfsp->vfs_iotstamp = lbolt;
236*0Sstevel@tonic-gate 		ub.ub_breads.value.ul++;		/* ufs && !logging */
237*0Sstevel@tonic-gate 		(void) bdev_strategy(bp);
238*0Sstevel@tonic-gate 	}
239*0Sstevel@tonic-gate 	if (lwp != NULL)
240*0Sstevel@tonic-gate 		lwp->lwp_ru.inblock++;
241*0Sstevel@tonic-gate 	CPU_STATS_ADD_K(sys, bread, 1);
242*0Sstevel@tonic-gate 	(void) biowait(bp);
243*0Sstevel@tonic-gate 	return (bp);
244*0Sstevel@tonic-gate }
245*0Sstevel@tonic-gate 
246*0Sstevel@tonic-gate /*
247*0Sstevel@tonic-gate  * Read in the block, like bread, but also start I/O on the
248*0Sstevel@tonic-gate  * read-ahead block (which is not allocated to the caller).
249*0Sstevel@tonic-gate  */
250*0Sstevel@tonic-gate struct buf *
251*0Sstevel@tonic-gate breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
252*0Sstevel@tonic-gate {
253*0Sstevel@tonic-gate 	struct buf *bp, *rabp;
254*0Sstevel@tonic-gate 	klwp_t *lwp = ttolwp(curthread);
255*0Sstevel@tonic-gate 
256*0Sstevel@tonic-gate 	bp = NULL;
257*0Sstevel@tonic-gate 	if (!bio_incore(dev, blkno)) {
258*0Sstevel@tonic-gate 		CPU_STATS_ADD_K(sys, lread, 1);
259*0Sstevel@tonic-gate 		bp = GETBLK(dev, blkno, bsize);
260*0Sstevel@tonic-gate 		if ((bp->b_flags & B_DONE) == 0) {
261*0Sstevel@tonic-gate 			bp->b_flags |= B_READ;
262*0Sstevel@tonic-gate 			bp->b_bcount = bsize;
263*0Sstevel@tonic-gate 			(void) bdev_strategy(bp);
264*0Sstevel@tonic-gate 			if (lwp != NULL)
265*0Sstevel@tonic-gate 				lwp->lwp_ru.inblock++;
266*0Sstevel@tonic-gate 			CPU_STATS_ADD_K(sys, bread, 1);
267*0Sstevel@tonic-gate 		}
268*0Sstevel@tonic-gate 	}
269*0Sstevel@tonic-gate 	if (rablkno && bfreelist.b_bcount > 1 &&
270*0Sstevel@tonic-gate 	    !bio_incore(dev, rablkno)) {
271*0Sstevel@tonic-gate 		rabp = GETBLK(dev, rablkno, bsize);
272*0Sstevel@tonic-gate 		if (rabp->b_flags & B_DONE)
273*0Sstevel@tonic-gate 			brelse(rabp);
274*0Sstevel@tonic-gate 		else {
275*0Sstevel@tonic-gate 			rabp->b_flags |= B_READ|B_ASYNC;
276*0Sstevel@tonic-gate 			rabp->b_bcount = bsize;
277*0Sstevel@tonic-gate 			(void) bdev_strategy(rabp);
278*0Sstevel@tonic-gate 			if (lwp != NULL)
279*0Sstevel@tonic-gate 				lwp->lwp_ru.inblock++;
280*0Sstevel@tonic-gate 			CPU_STATS_ADD_K(sys, bread, 1);
281*0Sstevel@tonic-gate 		}
282*0Sstevel@tonic-gate 	}
283*0Sstevel@tonic-gate 	if (bp == NULL)
284*0Sstevel@tonic-gate 		return (BREAD(dev, blkno, bsize));
285*0Sstevel@tonic-gate 	(void) biowait(bp);
286*0Sstevel@tonic-gate 	return (bp);
287*0Sstevel@tonic-gate }
288*0Sstevel@tonic-gate 
289*0Sstevel@tonic-gate /*
290*0Sstevel@tonic-gate  * Common code for writing a buffer with various options.
291*0Sstevel@tonic-gate  *
292*0Sstevel@tonic-gate  * force_wait  - wait for write completion regardless of B_ASYNC flag
293*0Sstevel@tonic-gate  * do_relse    - release the buffer when we are done
294*0Sstevel@tonic-gate  * clear_flags - flags to clear from the buffer
295*0Sstevel@tonic-gate  */
/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 *
 * Caller must hold the buffer's b_sem.
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
				int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	/* Snapshot flags before clearing; B_ASYNC may be among clear_flags. */
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	/* Disable preemption so all stats hit the same CPU's counters. */
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	/* Dispatch the write through the appropriate strategy routine. */
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}
338*0Sstevel@tonic-gate 
339*0Sstevel@tonic-gate /*
340*0Sstevel@tonic-gate  * Write the buffer, waiting for completion (unless B_ASYNC is set).
341*0Sstevel@tonic-gate  * Then release the buffer.
342*0Sstevel@tonic-gate  * This interface is provided for binary compatibility.  Using
343*0Sstevel@tonic-gate  * BWRITE() directly avoids the extra function call overhead invoked
344*0Sstevel@tonic-gate  * by calling this routine.
345*0Sstevel@tonic-gate  */
void
bwrite(struct buf *bp)
{
	/* Thin binary-compatibility wrapper around the BWRITE() macro. */
	BWRITE(bp);
}
351*0Sstevel@tonic-gate 
352*0Sstevel@tonic-gate /*
353*0Sstevel@tonic-gate  * Write the buffer, waiting for completion.
354*0Sstevel@tonic-gate  * But don't release the buffer afterwards.
355*0Sstevel@tonic-gate  * This interface is provided for binary compatibility.  Using
356*0Sstevel@tonic-gate  * BWRITE2() directly avoids the extra function call overhead.
357*0Sstevel@tonic-gate  */
void
bwrite2(struct buf *bp)
{
	/* Thin binary-compatibility wrapper around the BWRITE2() macro. */
	BWRITE2(bp);
}
363*0Sstevel@tonic-gate 
364*0Sstevel@tonic-gate /*
365*0Sstevel@tonic-gate  * Release the buffer, marking it so that if it is grabbed
366*0Sstevel@tonic-gate  * for another purpose it will be written out before being
367*0Sstevel@tonic-gate  * given up (e.g. when writing a partial block where it is
368*0Sstevel@tonic-gate  * assumed that another write for the same block will soon follow).
369*0Sstevel@tonic-gate  * Also save the time that the block is first marked as delayed
370*0Sstevel@tonic-gate  * so that it will be written in a reasonable time.
371*0Sstevel@tonic-gate  */
/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 *
 * Caller must hold the buffer's b_sem; brelse() releases it.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	/* Timestamp only the first transition into the delayed state. */
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = lbolt;
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}
388*0Sstevel@tonic-gate 
389*0Sstevel@tonic-gate /*
390*0Sstevel@tonic-gate  * Release the buffer, start I/O on it, but don't wait for completion.
391*0Sstevel@tonic-gate  */
/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 * (The write degrades to synchronous when free buffers look scarce.)
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}
402*0Sstevel@tonic-gate 
403*0Sstevel@tonic-gate /*
404*0Sstevel@tonic-gate  * Release the buffer, with no I/O implied.
405*0Sstevel@tonic-gate  */
/*
 * Release the buffer, with no I/O implied.
 *
 * Caller must hold b_sem.  The buffer is re-inserted on either the
 * bucket's free list or (if B_DELWRI) its delayed-write list, waiters
 * on buffer memory are woken, and b_sem is released.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = lbolt;
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put in on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp   = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		/* Aged buffers go at the head so they are reused first. */
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		/* Otherwise insert at the tail (LRU order). */
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 * Re-check under bfree_lock before waking waiters.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}
518*0Sstevel@tonic-gate 
519*0Sstevel@tonic-gate /*
520*0Sstevel@tonic-gate  * Return a count of the number of B_BUSY buffers in the system
521*0Sstevel@tonic-gate  * Can only be used as a good estimate.  If 'cleanit' is set,
522*0Sstevel@tonic-gate  * try to flush all bufs.
523*0Sstevel@tonic-gate  */
524*0Sstevel@tonic-gate int
525*0Sstevel@tonic-gate bio_busy(int cleanit)
526*0Sstevel@tonic-gate {
527*0Sstevel@tonic-gate 	struct buf *bp, *dp;
528*0Sstevel@tonic-gate 	int busy = 0;
529*0Sstevel@tonic-gate 	int i;
530*0Sstevel@tonic-gate 	kmutex_t *hmp;
531*0Sstevel@tonic-gate 
532*0Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
533*0Sstevel@tonic-gate 		vfs_syncprogress();
534*0Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
535*0Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
536*0Sstevel@tonic-gate 
537*0Sstevel@tonic-gate 		mutex_enter(hmp);
538*0Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
539*0Sstevel@tonic-gate 			if (bp->b_flags & B_BUSY)
540*0Sstevel@tonic-gate 				busy++;
541*0Sstevel@tonic-gate 		}
542*0Sstevel@tonic-gate 		mutex_exit(hmp);
543*0Sstevel@tonic-gate 	}
544*0Sstevel@tonic-gate 
545*0Sstevel@tonic-gate 	if (cleanit && busy != 0) {
546*0Sstevel@tonic-gate 		bflush(NODEV);
547*0Sstevel@tonic-gate 	}
548*0Sstevel@tonic-gate 
549*0Sstevel@tonic-gate 	return (busy);
550*0Sstevel@tonic-gate }
551*0Sstevel@tonic-gate 
552*0Sstevel@tonic-gate /*
553*0Sstevel@tonic-gate  * this interface is provided for binary compatibility.
554*0Sstevel@tonic-gate  *
555*0Sstevel@tonic-gate  * Assign a buffer for the given block.  If the appropriate
556*0Sstevel@tonic-gate  * block is already associated, return it; otherwise search
557*0Sstevel@tonic-gate  * for the oldest non-busy buffer and reassign it.
558*0Sstevel@tonic-gate  */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
	/* Binary-compat wrapper: no ufsvfs context, no error flagging. */
	return (getblk_common(/* ufsvfsp */ NULL, dev,
			blkno, bsize, /* errflg */ 0));
}
565*0Sstevel@tonic-gate 
566*0Sstevel@tonic-gate /*
567*0Sstevel@tonic-gate  * Assign a buffer for the given block.  If the appropriate
568*0Sstevel@tonic-gate  * block is already associated, return it; otherwise search
569*0Sstevel@tonic-gate  * for the oldest non-busy buffer and reassign it.
570*0Sstevel@tonic-gate  */
571*0Sstevel@tonic-gate struct buf *
572*0Sstevel@tonic-gate getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
573*0Sstevel@tonic-gate {
574*0Sstevel@tonic-gate 	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
575*0Sstevel@tonic-gate 	struct buf *bp;
576*0Sstevel@tonic-gate 	struct buf *dp;
577*0Sstevel@tonic-gate 	struct buf *nbp = NULL;
578*0Sstevel@tonic-gate 	struct buf *errbp;
579*0Sstevel@tonic-gate 	uint_t		index;
580*0Sstevel@tonic-gate 	kmutex_t	*hmp;
581*0Sstevel@tonic-gate 	struct	hbuf	*hp;
582*0Sstevel@tonic-gate 
583*0Sstevel@tonic-gate 	if (getmajor(dev) >= devcnt)
584*0Sstevel@tonic-gate 		cmn_err(CE_PANIC, "blkdev");
585*0Sstevel@tonic-gate 
586*0Sstevel@tonic-gate 	biostats.bio_lookup.value.ui32++;
587*0Sstevel@tonic-gate 
588*0Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
589*0Sstevel@tonic-gate 	hp    = &hbuf[index];
590*0Sstevel@tonic-gate 	dp    = (struct buf *)hp;
591*0Sstevel@tonic-gate 	hmp   = &hp->b_lock;
592*0Sstevel@tonic-gate 
593*0Sstevel@tonic-gate 	mutex_enter(hmp);
594*0Sstevel@tonic-gate loop:
595*0Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
596*0Sstevel@tonic-gate 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
597*0Sstevel@tonic-gate 		    (bp->b_flags & B_STALE))
598*0Sstevel@tonic-gate 			continue;
599*0Sstevel@tonic-gate 		/*
600*0Sstevel@tonic-gate 		 * Avoid holding the hash lock in the event that
601*0Sstevel@tonic-gate 		 * the buffer is locked by someone. Since the hash chain
602*0Sstevel@tonic-gate 		 * may change when we drop the hash lock
603*0Sstevel@tonic-gate 		 * we have to start at the beginning of the chain if the
604*0Sstevel@tonic-gate 		 * buffer identity/contents aren't valid.
605*0Sstevel@tonic-gate 		 */
606*0Sstevel@tonic-gate 		if (!sema_tryp(&bp->b_sem)) {
607*0Sstevel@tonic-gate 			biostats.bio_bufbusy.value.ui32++;
608*0Sstevel@tonic-gate 			mutex_exit(hmp);
609*0Sstevel@tonic-gate 			/*
610*0Sstevel@tonic-gate 			 * OK, we are dealing with a busy buffer.
611*0Sstevel@tonic-gate 			 * In the case that we are panicking and we
612*0Sstevel@tonic-gate 			 * got called from bread(), we have some chance
613*0Sstevel@tonic-gate 			 * for error recovery. So better bail out from
614*0Sstevel@tonic-gate 			 * here since sema_p() won't block. If we got
615*0Sstevel@tonic-gate 			 * called directly from ufs routines, there is
616*0Sstevel@tonic-gate 			 * no way to report an error yet.
617*0Sstevel@tonic-gate 			 */
618*0Sstevel@tonic-gate 			if (panicstr && errflg)
619*0Sstevel@tonic-gate 				goto errout;
620*0Sstevel@tonic-gate 			/*
621*0Sstevel@tonic-gate 			 * For the following line of code to work
622*0Sstevel@tonic-gate 			 * correctly never kmem_free the buffer "header".
623*0Sstevel@tonic-gate 			 */
624*0Sstevel@tonic-gate 			sema_p(&bp->b_sem);
625*0Sstevel@tonic-gate 			if (bp->b_blkno != blkno || bp->b_edev != dev ||
626*0Sstevel@tonic-gate 			    (bp->b_flags & B_STALE)) {
627*0Sstevel@tonic-gate 				sema_v(&bp->b_sem);
628*0Sstevel@tonic-gate 				mutex_enter(hmp);
629*0Sstevel@tonic-gate 				goto loop;	/* start over */
630*0Sstevel@tonic-gate 			}
631*0Sstevel@tonic-gate 			mutex_enter(hmp);
632*0Sstevel@tonic-gate 		}
633*0Sstevel@tonic-gate 		/* Found */
634*0Sstevel@tonic-gate 		biostats.bio_hit.value.ui32++;
635*0Sstevel@tonic-gate 		bp->b_flags &= ~B_AGE;
636*0Sstevel@tonic-gate 
637*0Sstevel@tonic-gate 		/*
638*0Sstevel@tonic-gate 		 * Yank it off the free/delayed write lists
639*0Sstevel@tonic-gate 		 */
640*0Sstevel@tonic-gate 		hp->b_length--;
641*0Sstevel@tonic-gate 		notavail(bp);
642*0Sstevel@tonic-gate 		mutex_exit(hmp);
643*0Sstevel@tonic-gate 
644*0Sstevel@tonic-gate 		ASSERT((bp->b_flags & B_NOCACHE) == NULL);
645*0Sstevel@tonic-gate 
646*0Sstevel@tonic-gate 		if (nbp == NULL) {
647*0Sstevel@tonic-gate 			/*
648*0Sstevel@tonic-gate 			 * Make the common path short.
649*0Sstevel@tonic-gate 			 */
650*0Sstevel@tonic-gate 			ASSERT(SEMA_HELD(&bp->b_sem));
651*0Sstevel@tonic-gate 			return (bp);
652*0Sstevel@tonic-gate 		}
653*0Sstevel@tonic-gate 
654*0Sstevel@tonic-gate 		biostats.bio_bufdup.value.ui32++;
655*0Sstevel@tonic-gate 
656*0Sstevel@tonic-gate 		/*
657*0Sstevel@tonic-gate 		 * The buffer must have entered during the lock upgrade
658*0Sstevel@tonic-gate 		 * so free the new buffer we allocated and return the
659*0Sstevel@tonic-gate 		 * found buffer.
660*0Sstevel@tonic-gate 		 */
661*0Sstevel@tonic-gate 		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
662*0Sstevel@tonic-gate 		nbp->b_un.b_addr = NULL;
663*0Sstevel@tonic-gate 
664*0Sstevel@tonic-gate 		/*
665*0Sstevel@tonic-gate 		 * Account for the memory
666*0Sstevel@tonic-gate 		 */
667*0Sstevel@tonic-gate 		mutex_enter(&bfree_lock);
668*0Sstevel@tonic-gate 		bfreelist.b_bufsize += nbp->b_bufsize;
669*0Sstevel@tonic-gate 		mutex_exit(&bfree_lock);
670*0Sstevel@tonic-gate 
671*0Sstevel@tonic-gate 		/*
672*0Sstevel@tonic-gate 		 * Destroy buf identity, and place on avail list
673*0Sstevel@tonic-gate 		 */
674*0Sstevel@tonic-gate 		nbp->b_dev = (o_dev_t)NODEV;
675*0Sstevel@tonic-gate 		nbp->b_edev = NODEV;
676*0Sstevel@tonic-gate 		nbp->b_flags = 0;
677*0Sstevel@tonic-gate 		nbp->b_file = NULL;
678*0Sstevel@tonic-gate 		nbp->b_offset = -1;
679*0Sstevel@tonic-gate 
680*0Sstevel@tonic-gate 		sema_v(&nbp->b_sem);
681*0Sstevel@tonic-gate 		bio_bhdr_free(nbp);
682*0Sstevel@tonic-gate 
683*0Sstevel@tonic-gate 		ASSERT(SEMA_HELD(&bp->b_sem));
684*0Sstevel@tonic-gate 		return (bp);
685*0Sstevel@tonic-gate 	}
686*0Sstevel@tonic-gate 
687*0Sstevel@tonic-gate 	/*
688*0Sstevel@tonic-gate 	 * bio_getfreeblk may block so check the hash chain again.
689*0Sstevel@tonic-gate 	 */
690*0Sstevel@tonic-gate 	if (nbp == NULL) {
691*0Sstevel@tonic-gate 		mutex_exit(hmp);
692*0Sstevel@tonic-gate 		nbp = bio_getfreeblk(bsize);
693*0Sstevel@tonic-gate 		mutex_enter(hmp);
694*0Sstevel@tonic-gate 		goto loop;
695*0Sstevel@tonic-gate 	}
696*0Sstevel@tonic-gate 
697*0Sstevel@tonic-gate 	/*
698*0Sstevel@tonic-gate 	 * New buffer. Assign nbp and stick it on the hash.
699*0Sstevel@tonic-gate 	 */
700*0Sstevel@tonic-gate 	nbp->b_flags = B_BUSY;
701*0Sstevel@tonic-gate 	nbp->b_edev = dev;
702*0Sstevel@tonic-gate 	nbp->b_dev = (o_dev_t)cmpdev(dev);
703*0Sstevel@tonic-gate 	nbp->b_blkno = blkno;
704*0Sstevel@tonic-gate 	nbp->b_iodone = NULL;
705*0Sstevel@tonic-gate 	nbp->b_bcount = bsize;
706*0Sstevel@tonic-gate 	/*
707*0Sstevel@tonic-gate 	 * If we are given a ufsvfsp and the vfs_root field is NULL
708*0Sstevel@tonic-gate 	 * then this must be I/O for a superblock.  A superblock's
709*0Sstevel@tonic-gate 	 * buffer is set up in mountfs() and there is no root vnode
710*0Sstevel@tonic-gate 	 * at that point.
711*0Sstevel@tonic-gate 	 */
712*0Sstevel@tonic-gate 	if (ufsvfsp && ufsvfsp->vfs_root) {
713*0Sstevel@tonic-gate 		nbp->b_vp = ufsvfsp->vfs_root;
714*0Sstevel@tonic-gate 	} else {
715*0Sstevel@tonic-gate 		nbp->b_vp = NULL;
716*0Sstevel@tonic-gate 	}
717*0Sstevel@tonic-gate 
718*0Sstevel@tonic-gate 	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
719*0Sstevel@tonic-gate 
720*0Sstevel@tonic-gate 	binshash(nbp, dp);
721*0Sstevel@tonic-gate 	mutex_exit(hmp);
722*0Sstevel@tonic-gate 
723*0Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&nbp->b_sem));
724*0Sstevel@tonic-gate 
725*0Sstevel@tonic-gate 	return (nbp);
726*0Sstevel@tonic-gate 
727*0Sstevel@tonic-gate 
728*0Sstevel@tonic-gate 	/*
729*0Sstevel@tonic-gate 	 * Come here in case of an internal error. At this point we couldn't
730*0Sstevel@tonic-gate 	 * get a buffer, but he have to return one. Hence we allocate some
731*0Sstevel@tonic-gate 	 * kind of error reply buffer on the fly. This buffer is marked as
732*0Sstevel@tonic-gate 	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
733*0Sstevel@tonic-gate 	 *	- B_ERROR will indicate error to the caller.
734*0Sstevel@tonic-gate 	 *	- B_DONE will prevent us from reading the buffer from
735*0Sstevel@tonic-gate 	 *	  the device.
736*0Sstevel@tonic-gate 	 *	- B_NOCACHE will cause that this buffer gets free'd in
737*0Sstevel@tonic-gate 	 *	  brelse().
738*0Sstevel@tonic-gate 	 */
739*0Sstevel@tonic-gate 
740*0Sstevel@tonic-gate errout:
741*0Sstevel@tonic-gate 	errbp = geteblk();
742*0Sstevel@tonic-gate 	sema_p(&errbp->b_sem);
743*0Sstevel@tonic-gate 	errbp->b_flags &= ~B_BUSY;
744*0Sstevel@tonic-gate 	errbp->b_flags |= (B_ERROR | B_DONE);
745*0Sstevel@tonic-gate 	return (errbp);
746*0Sstevel@tonic-gate }
747*0Sstevel@tonic-gate 
748*0Sstevel@tonic-gate /*
749*0Sstevel@tonic-gate  * Get an empty block, not assigned to any particular device.
750*0Sstevel@tonic-gate  * Returns a locked buffer that is not on any hash or free list.
751*0Sstevel@tonic-gate  */
752*0Sstevel@tonic-gate struct buf *
753*0Sstevel@tonic-gate ngeteblk(long bsize)
754*0Sstevel@tonic-gate {
755*0Sstevel@tonic-gate 	struct buf *bp;
756*0Sstevel@tonic-gate 
757*0Sstevel@tonic-gate 	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
758*0Sstevel@tonic-gate 	bioinit(bp);
759*0Sstevel@tonic-gate 	bp->av_forw = bp->av_back = NULL;
760*0Sstevel@tonic-gate 	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
761*0Sstevel@tonic-gate 	bp->b_bufsize = bsize;
762*0Sstevel@tonic-gate 	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
763*0Sstevel@tonic-gate 	bp->b_dev = (o_dev_t)NODEV;
764*0Sstevel@tonic-gate 	bp->b_edev = NODEV;
765*0Sstevel@tonic-gate 	bp->b_lblkno = 0;
766*0Sstevel@tonic-gate 	bp->b_bcount = bsize;
767*0Sstevel@tonic-gate 	bp->b_iodone = NULL;
768*0Sstevel@tonic-gate 	return (bp);
769*0Sstevel@tonic-gate }
770*0Sstevel@tonic-gate 
771*0Sstevel@tonic-gate /*
772*0Sstevel@tonic-gate  * Interface of geteblk() is kept intact to maintain driver compatibility.
773*0Sstevel@tonic-gate  * Use ngeteblk() to allocate block size other than 1 KB.
774*0Sstevel@tonic-gate  */
775*0Sstevel@tonic-gate struct buf *
776*0Sstevel@tonic-gate geteblk(void)
777*0Sstevel@tonic-gate {
778*0Sstevel@tonic-gate 	return (ngeteblk((long)1024));
779*0Sstevel@tonic-gate }
780*0Sstevel@tonic-gate 
781*0Sstevel@tonic-gate /*
782*0Sstevel@tonic-gate  * Return a buffer w/o sleeping
783*0Sstevel@tonic-gate  */
784*0Sstevel@tonic-gate struct buf *
785*0Sstevel@tonic-gate trygetblk(dev_t dev, daddr_t blkno)
786*0Sstevel@tonic-gate {
787*0Sstevel@tonic-gate 	struct buf	*bp;
788*0Sstevel@tonic-gate 	struct buf	*dp;
789*0Sstevel@tonic-gate 	struct hbuf	*hp;
790*0Sstevel@tonic-gate 	kmutex_t	*hmp;
791*0Sstevel@tonic-gate 	uint_t		index;
792*0Sstevel@tonic-gate 
793*0Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
794*0Sstevel@tonic-gate 	hp = &hbuf[index];
795*0Sstevel@tonic-gate 	hmp = &hp->b_lock;
796*0Sstevel@tonic-gate 
797*0Sstevel@tonic-gate 	if (!mutex_tryenter(hmp))
798*0Sstevel@tonic-gate 		return (NULL);
799*0Sstevel@tonic-gate 
800*0Sstevel@tonic-gate 	dp = (struct buf *)hp;
801*0Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
802*0Sstevel@tonic-gate 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
803*0Sstevel@tonic-gate 		    (bp->b_flags & B_STALE))
804*0Sstevel@tonic-gate 			continue;
805*0Sstevel@tonic-gate 		/*
806*0Sstevel@tonic-gate 		 * Get access to a valid buffer without sleeping
807*0Sstevel@tonic-gate 		 */
808*0Sstevel@tonic-gate 		if (sema_tryp(&bp->b_sem)) {
809*0Sstevel@tonic-gate 			if (bp->b_flags & B_DONE) {
810*0Sstevel@tonic-gate 				hp->b_length--;
811*0Sstevel@tonic-gate 				notavail(bp);
812*0Sstevel@tonic-gate 				mutex_exit(hmp);
813*0Sstevel@tonic-gate 				return (bp);
814*0Sstevel@tonic-gate 			} else {
815*0Sstevel@tonic-gate 				sema_v(&bp->b_sem);
816*0Sstevel@tonic-gate 				break;
817*0Sstevel@tonic-gate 			}
818*0Sstevel@tonic-gate 		}
819*0Sstevel@tonic-gate 		break;
820*0Sstevel@tonic-gate 	}
821*0Sstevel@tonic-gate 	mutex_exit(hmp);
822*0Sstevel@tonic-gate 	return (NULL);
823*0Sstevel@tonic-gate }
824*0Sstevel@tonic-gate 
825*0Sstevel@tonic-gate /*
826*0Sstevel@tonic-gate  * Wait for I/O completion on the buffer; return errors
827*0Sstevel@tonic-gate  * to the user.
828*0Sstevel@tonic-gate  */
829*0Sstevel@tonic-gate int
830*0Sstevel@tonic-gate iowait(struct buf *bp)
831*0Sstevel@tonic-gate {
832*0Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
833*0Sstevel@tonic-gate 	return (biowait(bp));
834*0Sstevel@tonic-gate }
835*0Sstevel@tonic-gate 
836*0Sstevel@tonic-gate /*
837*0Sstevel@tonic-gate  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
838*0Sstevel@tonic-gate  * and wake up anyone waiting for it.
839*0Sstevel@tonic-gate  */
840*0Sstevel@tonic-gate void
841*0Sstevel@tonic-gate iodone(struct buf *bp)
842*0Sstevel@tonic-gate {
843*0Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
844*0Sstevel@tonic-gate 	(void) biodone(bp);
845*0Sstevel@tonic-gate }
846*0Sstevel@tonic-gate 
847*0Sstevel@tonic-gate /*
848*0Sstevel@tonic-gate  * Zero the core associated with a buffer.
849*0Sstevel@tonic-gate  */
850*0Sstevel@tonic-gate void
851*0Sstevel@tonic-gate clrbuf(struct buf *bp)
852*0Sstevel@tonic-gate {
853*0Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
854*0Sstevel@tonic-gate 	bzero(bp->b_un.b_addr, bp->b_bcount);
855*0Sstevel@tonic-gate 	bp->b_resid = 0;
856*0Sstevel@tonic-gate }
857*0Sstevel@tonic-gate 
858*0Sstevel@tonic-gate 
859*0Sstevel@tonic-gate /*
860*0Sstevel@tonic-gate  * Make sure all write-behind blocks on dev (or NODEV for all)
861*0Sstevel@tonic-gate  * are flushed out.
862*0Sstevel@tonic-gate  */
863*0Sstevel@tonic-gate void
864*0Sstevel@tonic-gate bflush(dev_t dev)
865*0Sstevel@tonic-gate {
866*0Sstevel@tonic-gate 	struct buf *bp, *dp;
867*0Sstevel@tonic-gate 	struct hbuf *hp;
868*0Sstevel@tonic-gate 	struct buf *delwri_list = EMPTY_LIST;
869*0Sstevel@tonic-gate 	int i, index;
870*0Sstevel@tonic-gate 	kmutex_t *hmp;
871*0Sstevel@tonic-gate 
872*0Sstevel@tonic-gate 	mutex_enter(&blist_lock);
873*0Sstevel@tonic-gate 	/*
874*0Sstevel@tonic-gate 	 * Wait for any invalidates or flushes ahead of us to finish.
875*0Sstevel@tonic-gate 	 * We really could split blist_lock up per device for better
876*0Sstevel@tonic-gate 	 * parallelism here.
877*0Sstevel@tonic-gate 	 */
878*0Sstevel@tonic-gate 	while (bio_doinginval || bio_doingflush) {
879*0Sstevel@tonic-gate 		bio_flinv_cv_wanted = 1;
880*0Sstevel@tonic-gate 		cv_wait(&bio_flushinval_cv, &blist_lock);
881*0Sstevel@tonic-gate 	}
882*0Sstevel@tonic-gate 	bio_doingflush++;
883*0Sstevel@tonic-gate 	/*
884*0Sstevel@tonic-gate 	 * Gather all B_DELWRI buffer for device.
885*0Sstevel@tonic-gate 	 * Lock ordering is b_sem > hash lock (brelse).
886*0Sstevel@tonic-gate 	 * Since we are finding the buffer via the delayed write list,
887*0Sstevel@tonic-gate 	 * it may be busy and we would block trying to get the
888*0Sstevel@tonic-gate 	 * b_sem lock while holding hash lock. So transfer all the
889*0Sstevel@tonic-gate 	 * candidates on the delwri_list and then drop the hash locks.
890*0Sstevel@tonic-gate 	 */
891*0Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
892*0Sstevel@tonic-gate 		vfs_syncprogress();
893*0Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
894*0Sstevel@tonic-gate 		dp = (struct buf *)&dwbuf[i];
895*0Sstevel@tonic-gate 		mutex_enter(hmp);
896*0Sstevel@tonic-gate 		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
897*0Sstevel@tonic-gate 			if (dev == NODEV || bp->b_edev == dev) {
898*0Sstevel@tonic-gate 				if (bp->b_list == NULL) {
899*0Sstevel@tonic-gate 					bp->b_list = delwri_list;
900*0Sstevel@tonic-gate 					delwri_list = bp;
901*0Sstevel@tonic-gate 				}
902*0Sstevel@tonic-gate 			}
903*0Sstevel@tonic-gate 		}
904*0Sstevel@tonic-gate 		mutex_exit(hmp);
905*0Sstevel@tonic-gate 	}
906*0Sstevel@tonic-gate 	mutex_exit(&blist_lock);
907*0Sstevel@tonic-gate 
908*0Sstevel@tonic-gate 	/*
909*0Sstevel@tonic-gate 	 * Now that the hash locks have been dropped grab the semaphores
910*0Sstevel@tonic-gate 	 * and write back all the buffers that have B_DELWRI set.
911*0Sstevel@tonic-gate 	 */
912*0Sstevel@tonic-gate 	while (delwri_list != EMPTY_LIST) {
913*0Sstevel@tonic-gate 		vfs_syncprogress();
914*0Sstevel@tonic-gate 		bp = delwri_list;
915*0Sstevel@tonic-gate 
916*0Sstevel@tonic-gate 		sema_p(&bp->b_sem);	/* may block */
917*0Sstevel@tonic-gate 		if ((dev != bp->b_edev && dev != NODEV) ||
918*0Sstevel@tonic-gate 		    (panicstr && bp->b_flags & B_BUSY)) {
919*0Sstevel@tonic-gate 			sema_v(&bp->b_sem);
920*0Sstevel@tonic-gate 			delwri_list = bp->b_list;
921*0Sstevel@tonic-gate 			bp->b_list = NULL;
922*0Sstevel@tonic-gate 			continue;	/* No longer a candidate */
923*0Sstevel@tonic-gate 		}
924*0Sstevel@tonic-gate 		if (bp->b_flags & B_DELWRI) {
925*0Sstevel@tonic-gate 			index = bio_bhash(bp->b_edev, bp->b_blkno);
926*0Sstevel@tonic-gate 			hp = &hbuf[index];
927*0Sstevel@tonic-gate 			hmp = &hp->b_lock;
928*0Sstevel@tonic-gate 			dp = (struct buf *)hp;
929*0Sstevel@tonic-gate 
930*0Sstevel@tonic-gate 			bp->b_flags |= B_ASYNC;
931*0Sstevel@tonic-gate 			mutex_enter(hmp);
932*0Sstevel@tonic-gate 			hp->b_length--;
933*0Sstevel@tonic-gate 			notavail(bp);
934*0Sstevel@tonic-gate 			mutex_exit(hmp);
935*0Sstevel@tonic-gate 			if (bp->b_vp == NULL) {		/* !ufs */
936*0Sstevel@tonic-gate 				BWRITE(bp);
937*0Sstevel@tonic-gate 			} else {			/* ufs */
938*0Sstevel@tonic-gate 				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
939*0Sstevel@tonic-gate 			}
940*0Sstevel@tonic-gate 		} else {
941*0Sstevel@tonic-gate 			sema_v(&bp->b_sem);
942*0Sstevel@tonic-gate 		}
943*0Sstevel@tonic-gate 		delwri_list = bp->b_list;
944*0Sstevel@tonic-gate 		bp->b_list = NULL;
945*0Sstevel@tonic-gate 	}
946*0Sstevel@tonic-gate 	mutex_enter(&blist_lock);
947*0Sstevel@tonic-gate 	bio_doingflush--;
948*0Sstevel@tonic-gate 	if (bio_flinv_cv_wanted) {
949*0Sstevel@tonic-gate 		bio_flinv_cv_wanted = 0;
950*0Sstevel@tonic-gate 		cv_broadcast(&bio_flushinval_cv);
951*0Sstevel@tonic-gate 	}
952*0Sstevel@tonic-gate 	mutex_exit(&blist_lock);
953*0Sstevel@tonic-gate }
954*0Sstevel@tonic-gate 
955*0Sstevel@tonic-gate /*
956*0Sstevel@tonic-gate  * Ensure that a specified block is up-to-date on disk.
957*0Sstevel@tonic-gate  */
958*0Sstevel@tonic-gate void
959*0Sstevel@tonic-gate blkflush(dev_t dev, daddr_t blkno)
960*0Sstevel@tonic-gate {
961*0Sstevel@tonic-gate 	struct buf *bp, *dp;
962*0Sstevel@tonic-gate 	struct hbuf *hp;
963*0Sstevel@tonic-gate 	struct buf *sbp = NULL;
964*0Sstevel@tonic-gate 	uint_t index;
965*0Sstevel@tonic-gate 	kmutex_t *hmp;
966*0Sstevel@tonic-gate 
967*0Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
968*0Sstevel@tonic-gate 	hp    = &hbuf[index];
969*0Sstevel@tonic-gate 	dp    = (struct buf *)hp;
970*0Sstevel@tonic-gate 	hmp   = &hp->b_lock;
971*0Sstevel@tonic-gate 
972*0Sstevel@tonic-gate 	/*
973*0Sstevel@tonic-gate 	 * Identify the buffer in the cache belonging to
974*0Sstevel@tonic-gate 	 * this device and blkno (if any).
975*0Sstevel@tonic-gate 	 */
976*0Sstevel@tonic-gate 	mutex_enter(hmp);
977*0Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
978*0Sstevel@tonic-gate 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
979*0Sstevel@tonic-gate 		    (bp->b_flags & B_STALE))
980*0Sstevel@tonic-gate 			continue;
981*0Sstevel@tonic-gate 		sbp = bp;
982*0Sstevel@tonic-gate 		break;
983*0Sstevel@tonic-gate 	}
984*0Sstevel@tonic-gate 	mutex_exit(hmp);
985*0Sstevel@tonic-gate 	if (sbp == NULL)
986*0Sstevel@tonic-gate 		return;
987*0Sstevel@tonic-gate 	/*
988*0Sstevel@tonic-gate 	 * Now check the buffer we have identified and
989*0Sstevel@tonic-gate 	 * make sure it still belongs to the device and is B_DELWRI
990*0Sstevel@tonic-gate 	 */
991*0Sstevel@tonic-gate 	sema_p(&sbp->b_sem);
992*0Sstevel@tonic-gate 	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
993*0Sstevel@tonic-gate 	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
994*0Sstevel@tonic-gate 		mutex_enter(hmp);
995*0Sstevel@tonic-gate 		hp->b_length--;
996*0Sstevel@tonic-gate 		notavail(sbp);
997*0Sstevel@tonic-gate 		mutex_exit(hmp);
998*0Sstevel@tonic-gate 		/*
999*0Sstevel@tonic-gate 		 * XXX - There is nothing to guarantee a synchronous
1000*0Sstevel@tonic-gate 		 * write here if the B_ASYNC flag is set.  This needs
1001*0Sstevel@tonic-gate 		 * some investigation.
1002*0Sstevel@tonic-gate 		 */
1003*0Sstevel@tonic-gate 		if (sbp->b_vp == NULL) {		/* !ufs */
1004*0Sstevel@tonic-gate 			BWRITE(sbp);	/* synchronous write */
1005*0Sstevel@tonic-gate 		} else {				/* ufs */
1006*0Sstevel@tonic-gate 			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1007*0Sstevel@tonic-gate 		}
1008*0Sstevel@tonic-gate 	} else {
1009*0Sstevel@tonic-gate 		sema_v(&sbp->b_sem);
1010*0Sstevel@tonic-gate 	}
1011*0Sstevel@tonic-gate }
1012*0Sstevel@tonic-gate 
1013*0Sstevel@tonic-gate /*
1014*0Sstevel@tonic-gate  * Same as binval, except can force-invalidate delayed-write buffers
1015*0Sstevel@tonic-gate  * (which are not be already flushed because of device errors).  Also
1016*0Sstevel@tonic-gate  * makes sure that the retry write flag is cleared.
1017*0Sstevel@tonic-gate  */
1018*0Sstevel@tonic-gate int
1019*0Sstevel@tonic-gate bfinval(dev_t dev, int force)
1020*0Sstevel@tonic-gate {
1021*0Sstevel@tonic-gate 	struct buf *dp;
1022*0Sstevel@tonic-gate 	struct buf *bp;
1023*0Sstevel@tonic-gate 	struct buf *binval_list = EMPTY_LIST;
1024*0Sstevel@tonic-gate 	int i, error = 0;
1025*0Sstevel@tonic-gate 	kmutex_t *hmp;
1026*0Sstevel@tonic-gate 	uint_t index;
1027*0Sstevel@tonic-gate 	struct buf **backp;
1028*0Sstevel@tonic-gate 
1029*0Sstevel@tonic-gate 	mutex_enter(&blist_lock);
1030*0Sstevel@tonic-gate 	/*
1031*0Sstevel@tonic-gate 	 * Wait for any flushes ahead of us to finish, it's ok to
1032*0Sstevel@tonic-gate 	 * do invalidates in parallel.
1033*0Sstevel@tonic-gate 	 */
1034*0Sstevel@tonic-gate 	while (bio_doingflush) {
1035*0Sstevel@tonic-gate 		bio_flinv_cv_wanted = 1;
1036*0Sstevel@tonic-gate 		cv_wait(&bio_flushinval_cv, &blist_lock);
1037*0Sstevel@tonic-gate 	}
1038*0Sstevel@tonic-gate 	bio_doinginval++;
1039*0Sstevel@tonic-gate 
1040*0Sstevel@tonic-gate 	/* Gather bp's */
1041*0Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
1042*0Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
1043*0Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
1044*0Sstevel@tonic-gate 
1045*0Sstevel@tonic-gate 		mutex_enter(hmp);
1046*0Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1047*0Sstevel@tonic-gate 			if (bp->b_edev == dev) {
1048*0Sstevel@tonic-gate 				if (bp->b_list == NULL) {
1049*0Sstevel@tonic-gate 					bp->b_list = binval_list;
1050*0Sstevel@tonic-gate 					binval_list = bp;
1051*0Sstevel@tonic-gate 				}
1052*0Sstevel@tonic-gate 			}
1053*0Sstevel@tonic-gate 		}
1054*0Sstevel@tonic-gate 		mutex_exit(hmp);
1055*0Sstevel@tonic-gate 	}
1056*0Sstevel@tonic-gate 	mutex_exit(&blist_lock);
1057*0Sstevel@tonic-gate 
1058*0Sstevel@tonic-gate 	/* Invalidate all bp's found */
1059*0Sstevel@tonic-gate 	while (binval_list != EMPTY_LIST) {
1060*0Sstevel@tonic-gate 		bp = binval_list;
1061*0Sstevel@tonic-gate 
1062*0Sstevel@tonic-gate 		sema_p(&bp->b_sem);
1063*0Sstevel@tonic-gate 		if (bp->b_edev == dev) {
1064*0Sstevel@tonic-gate 			if (force && (bp->b_flags & B_DELWRI)) {
1065*0Sstevel@tonic-gate 				/* clear B_DELWRI, move to non-dw freelist */
1066*0Sstevel@tonic-gate 				index = bio_bhash(bp->b_edev, bp->b_blkno);
1067*0Sstevel@tonic-gate 				hmp = &hbuf[index].b_lock;
1068*0Sstevel@tonic-gate 				dp = (struct buf *)&hbuf[index];
1069*0Sstevel@tonic-gate 				mutex_enter(hmp);
1070*0Sstevel@tonic-gate 
1071*0Sstevel@tonic-gate 				/* remove from delayed write freelist */
1072*0Sstevel@tonic-gate 				notavail(bp);
1073*0Sstevel@tonic-gate 
1074*0Sstevel@tonic-gate 				/* add to B_AGE side of non-dw freelist */
1075*0Sstevel@tonic-gate 				backp = &dp->av_forw;
1076*0Sstevel@tonic-gate 				(*backp)->av_back = bp;
1077*0Sstevel@tonic-gate 				bp->av_forw = *backp;
1078*0Sstevel@tonic-gate 				*backp = bp;
1079*0Sstevel@tonic-gate 				bp->av_back = dp;
1080*0Sstevel@tonic-gate 
1081*0Sstevel@tonic-gate 				/*
1082*0Sstevel@tonic-gate 				 * make sure write retries and busy are cleared
1083*0Sstevel@tonic-gate 				 */
1084*0Sstevel@tonic-gate 				bp->b_flags &=
1085*0Sstevel@tonic-gate 				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1086*0Sstevel@tonic-gate 				mutex_exit(hmp);
1087*0Sstevel@tonic-gate 			}
1088*0Sstevel@tonic-gate 			if ((bp->b_flags & B_DELWRI) == 0)
1089*0Sstevel@tonic-gate 				bp->b_flags |= B_STALE|B_AGE;
1090*0Sstevel@tonic-gate 			else
1091*0Sstevel@tonic-gate 				error = EIO;
1092*0Sstevel@tonic-gate 		}
1093*0Sstevel@tonic-gate 		sema_v(&bp->b_sem);
1094*0Sstevel@tonic-gate 		binval_list = bp->b_list;
1095*0Sstevel@tonic-gate 		bp->b_list = NULL;
1096*0Sstevel@tonic-gate 	}
1097*0Sstevel@tonic-gate 	mutex_enter(&blist_lock);
1098*0Sstevel@tonic-gate 	bio_doinginval--;
1099*0Sstevel@tonic-gate 	if (bio_flinv_cv_wanted) {
1100*0Sstevel@tonic-gate 		cv_broadcast(&bio_flushinval_cv);
1101*0Sstevel@tonic-gate 		bio_flinv_cv_wanted = 0;
1102*0Sstevel@tonic-gate 	}
1103*0Sstevel@tonic-gate 	mutex_exit(&blist_lock);
1104*0Sstevel@tonic-gate 	return (error);
1105*0Sstevel@tonic-gate }
1106*0Sstevel@tonic-gate 
1107*0Sstevel@tonic-gate /*
1108*0Sstevel@tonic-gate  * If possible, invalidate blocks for a dev on demand
1109*0Sstevel@tonic-gate  */
1110*0Sstevel@tonic-gate void
1111*0Sstevel@tonic-gate binval(dev_t dev)
1112*0Sstevel@tonic-gate {
1113*0Sstevel@tonic-gate 	(void) bfinval(dev, 0);
1114*0Sstevel@tonic-gate }
1115*0Sstevel@tonic-gate 
1116*0Sstevel@tonic-gate /*
1117*0Sstevel@tonic-gate  * Initialize the buffer I/O system by freeing
1118*0Sstevel@tonic-gate  * all buffers and setting all device hash buffer lists to empty.
1119*0Sstevel@tonic-gate  */
1120*0Sstevel@tonic-gate void
1121*0Sstevel@tonic-gate binit(void)
1122*0Sstevel@tonic-gate {
1123*0Sstevel@tonic-gate 	struct buf *bp;
1124*0Sstevel@tonic-gate 	unsigned int i, pct;
1125*0Sstevel@tonic-gate 	ulong_t	bio_max_hwm, bio_default_hwm;
1126*0Sstevel@tonic-gate 
1127*0Sstevel@tonic-gate 	/*
1128*0Sstevel@tonic-gate 	 * Maximum/Default values for bufhwm are set to the smallest of:
1129*0Sstevel@tonic-gate 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1130*0Sstevel@tonic-gate 	 *	- 1/4 of kernel virtual memory
1131*0Sstevel@tonic-gate 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1132*0Sstevel@tonic-gate 	 * Additionally, in order to allow simple tuning by percentage of
1133*0Sstevel@tonic-gate 	 * physical memory, bufhwm_pct is used to calculate the default if
1134*0Sstevel@tonic-gate 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1135*0Sstevel@tonic-gate 	 *
1136*0Sstevel@tonic-gate 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1137*0Sstevel@tonic-gate 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1138*0Sstevel@tonic-gate 	 */
1139*0Sstevel@tonic-gate 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1140*0Sstevel@tonic-gate 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1141*0Sstevel@tonic-gate 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1142*0Sstevel@tonic-gate 
1143*0Sstevel@tonic-gate 	pct = BIO_BUF_PERCENT;
1144*0Sstevel@tonic-gate 	if (bufhwm_pct != 0 &&
1145*0Sstevel@tonic-gate 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1146*0Sstevel@tonic-gate 		pct = BIO_BUF_PERCENT;
1147*0Sstevel@tonic-gate 		/*
1148*0Sstevel@tonic-gate 		 * Invalid user specified value, emit a warning.
1149*0Sstevel@tonic-gate 		 */
1150*0Sstevel@tonic-gate 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1151*0Sstevel@tonic-gate 			range(1..%d). Using %d as default.",
1152*0Sstevel@tonic-gate 			bufhwm_pct,
1153*0Sstevel@tonic-gate 			100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1154*0Sstevel@tonic-gate 	}
1155*0Sstevel@tonic-gate 
1156*0Sstevel@tonic-gate 	bio_default_hwm = MIN(physmem / pct,
1157*0Sstevel@tonic-gate 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1158*0Sstevel@tonic-gate 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1159*0Sstevel@tonic-gate 
1160*0Sstevel@tonic-gate 	if ((v.v_bufhwm = bufhwm) == 0)
1161*0Sstevel@tonic-gate 		v.v_bufhwm = bio_default_hwm;
1162*0Sstevel@tonic-gate 
1163*0Sstevel@tonic-gate 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1164*0Sstevel@tonic-gate 		v.v_bufhwm = (int)bio_max_hwm;
1165*0Sstevel@tonic-gate 		/*
1166*0Sstevel@tonic-gate 		 * Invalid user specified value, emit a warning.
1167*0Sstevel@tonic-gate 		 */
1168*0Sstevel@tonic-gate 		cmn_err(CE_WARN,
1169*0Sstevel@tonic-gate 			"binit: bufhwm(%d) out \
1170*0Sstevel@tonic-gate 			of range(%d..%lu). Using %lu as default",
1171*0Sstevel@tonic-gate 			bufhwm,
1172*0Sstevel@tonic-gate 			BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1173*0Sstevel@tonic-gate 	}
1174*0Sstevel@tonic-gate 
1175*0Sstevel@tonic-gate 	/*
1176*0Sstevel@tonic-gate 	 * Determine the number of hash buckets. Default is to
1177*0Sstevel@tonic-gate 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1178*0Sstevel@tonic-gate 	 * Round up number to the next power of 2.
1179*0Sstevel@tonic-gate 	 */
1180*0Sstevel@tonic-gate 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1181*0Sstevel@tonic-gate 	    BIO_HASHLEN);
1182*0Sstevel@tonic-gate 	v.v_hmask = v.v_hbuf - 1;
1183*0Sstevel@tonic-gate 	v.v_buf = BIO_BHDR_POOL;
1184*0Sstevel@tonic-gate 
1185*0Sstevel@tonic-gate 	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1186*0Sstevel@tonic-gate 
1187*0Sstevel@tonic-gate 	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1188*0Sstevel@tonic-gate 
1189*0Sstevel@tonic-gate 	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1190*0Sstevel@tonic-gate 	bp = &bfreelist;
1191*0Sstevel@tonic-gate 	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1192*0Sstevel@tonic-gate 
1193*0Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
1194*0Sstevel@tonic-gate 		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1195*0Sstevel@tonic-gate 		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1196*0Sstevel@tonic-gate 
1197*0Sstevel@tonic-gate 		/*
1198*0Sstevel@tonic-gate 		 * Initialize the delayed write buffer list.
1199*0Sstevel@tonic-gate 		 */
1200*0Sstevel@tonic-gate 		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1201*0Sstevel@tonic-gate 		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1202*0Sstevel@tonic-gate 	}
1203*0Sstevel@tonic-gate }
1204*0Sstevel@tonic-gate 
1205*0Sstevel@tonic-gate /*
1206*0Sstevel@tonic-gate  * Wait for I/O completion on the buffer; return error code.
1207*0Sstevel@tonic-gate  * If bp was for synchronous I/O, bp is invalid and associated
1208*0Sstevel@tonic-gate  * resources are freed on return.
1209*0Sstevel@tonic-gate  */
1210*0Sstevel@tonic-gate int
1211*0Sstevel@tonic-gate biowait(struct buf *bp)
1212*0Sstevel@tonic-gate {
1213*0Sstevel@tonic-gate 	int error = 0;
1214*0Sstevel@tonic-gate 	struct cpu *cpup;
1215*0Sstevel@tonic-gate 
1216*0Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
1217*0Sstevel@tonic-gate 
1218*0Sstevel@tonic-gate 	cpup = CPU;
1219*0Sstevel@tonic-gate 	atomic_add_64(&cpup->cpu_stats.sys.iowait, 1);
1220*0Sstevel@tonic-gate 	DTRACE_IO1(wait__start, struct buf *, bp);
1221*0Sstevel@tonic-gate 
1222*0Sstevel@tonic-gate 	/*
1223*0Sstevel@tonic-gate 	 * In case of panic, busy wait for completion
1224*0Sstevel@tonic-gate 	 */
1225*0Sstevel@tonic-gate 	if (panicstr) {
1226*0Sstevel@tonic-gate 		while ((bp->b_flags & B_DONE) == 0)
1227*0Sstevel@tonic-gate 			drv_usecwait(10);
1228*0Sstevel@tonic-gate 	} else
1229*0Sstevel@tonic-gate 		sema_p(&bp->b_io);
1230*0Sstevel@tonic-gate 
1231*0Sstevel@tonic-gate 	DTRACE_IO1(wait__done, struct buf *, bp);
1232*0Sstevel@tonic-gate 	atomic_add_64(&cpup->cpu_stats.sys.iowait, -1);
1233*0Sstevel@tonic-gate 
1234*0Sstevel@tonic-gate 	error = geterror(bp);
1235*0Sstevel@tonic-gate 	if ((bp->b_flags & B_ASYNC) == 0) {
1236*0Sstevel@tonic-gate 		if (bp->b_flags & B_REMAPPED)
1237*0Sstevel@tonic-gate 			bp_mapout(bp);
1238*0Sstevel@tonic-gate 	}
1239*0Sstevel@tonic-gate 	return (error);
1240*0Sstevel@tonic-gate }
1241*0Sstevel@tonic-gate 
1242*0Sstevel@tonic-gate static void
1243*0Sstevel@tonic-gate biodone_tnf_probe(struct buf *bp)
1244*0Sstevel@tonic-gate {
1245*0Sstevel@tonic-gate 	/* Kernel probe */
1246*0Sstevel@tonic-gate 	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1247*0Sstevel@tonic-gate 		tnf_device,	device,		bp->b_edev,
1248*0Sstevel@tonic-gate 		tnf_diskaddr,	block,		bp->b_lblkno,
1249*0Sstevel@tonic-gate 		tnf_opaque,	buf,		bp);
1250*0Sstevel@tonic-gate }
1251*0Sstevel@tonic-gate 
1252*0Sstevel@tonic-gate /*
1253*0Sstevel@tonic-gate  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1254*0Sstevel@tonic-gate  * and wake up anyone waiting for it.
1255*0Sstevel@tonic-gate  */
1256*0Sstevel@tonic-gate void
1257*0Sstevel@tonic-gate biodone(struct buf *bp)
1258*0Sstevel@tonic-gate {
1259*0Sstevel@tonic-gate 	if (bp->b_flags & B_STARTED) {
1260*0Sstevel@tonic-gate 		DTRACE_IO1(done, struct buf *, bp);
1261*0Sstevel@tonic-gate 		bp->b_flags &= ~B_STARTED;
1262*0Sstevel@tonic-gate 	}
1263*0Sstevel@tonic-gate 
1264*0Sstevel@tonic-gate 	/*
1265*0Sstevel@tonic-gate 	 * Call the TNF probe here instead of the inline code
1266*0Sstevel@tonic-gate 	 * to force our compiler to use the tail call optimization.
1267*0Sstevel@tonic-gate 	 */
1268*0Sstevel@tonic-gate 	biodone_tnf_probe(bp);
1269*0Sstevel@tonic-gate 
1270*0Sstevel@tonic-gate 	if (bp->b_iodone != NULL) {
1271*0Sstevel@tonic-gate 		(*(bp->b_iodone))(bp);
1272*0Sstevel@tonic-gate 		return;
1273*0Sstevel@tonic-gate 	}
1274*0Sstevel@tonic-gate 	ASSERT((bp->b_flags & B_DONE) == 0);
1275*0Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
1276*0Sstevel@tonic-gate 	bp->b_flags |= B_DONE;
1277*0Sstevel@tonic-gate 	if (bp->b_flags & B_ASYNC) {
1278*0Sstevel@tonic-gate 		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1279*0Sstevel@tonic-gate 			bio_pageio_done(bp);
1280*0Sstevel@tonic-gate 		else
1281*0Sstevel@tonic-gate 			brelse(bp);	/* release bp to freelist */
1282*0Sstevel@tonic-gate 	} else {
1283*0Sstevel@tonic-gate 		sema_v(&bp->b_io);
1284*0Sstevel@tonic-gate 	}
1285*0Sstevel@tonic-gate }
1286*0Sstevel@tonic-gate 
1287*0Sstevel@tonic-gate /*
1288*0Sstevel@tonic-gate  * Pick up the device's error number and pass it to the user;
1289*0Sstevel@tonic-gate  * if there is an error but the number is 0 set a generalized code.
1290*0Sstevel@tonic-gate  */
1291*0Sstevel@tonic-gate int
1292*0Sstevel@tonic-gate geterror(struct buf *bp)
1293*0Sstevel@tonic-gate {
1294*0Sstevel@tonic-gate 	int error = 0;
1295*0Sstevel@tonic-gate 
1296*0Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
1297*0Sstevel@tonic-gate 	if (bp->b_flags & B_ERROR) {
1298*0Sstevel@tonic-gate 		error = bp->b_error;
1299*0Sstevel@tonic-gate 		if (!error)
1300*0Sstevel@tonic-gate 			error = EIO;
1301*0Sstevel@tonic-gate 	}
1302*0Sstevel@tonic-gate 	return (error);
1303*0Sstevel@tonic-gate }
1304*0Sstevel@tonic-gate 
/*
 * Support for pageio buffers.
 *
 * This stuff should be generalized to provide a generic bp
 * header facility that can be used for things other than pageio.
 */
1311*0Sstevel@tonic-gate 
/*
 * Allocate and initialize a buf struct for use with pageio.
 *
 * 'pp' is the page list to attach, 'len' the byte count of the
 * transfer, 'vp' the vnode (held here, released by pageio_done()),
 * and 'flags' the B_* flags merged into b_flags.  Returns a buf
 * with b_sem held; the caller sets b_edev/b_blkno and may use
 * bp_mapin() to make the pages kernel addressable.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		/*
		 * Pagein path: charge the per-CPU vm counters and, for
		 * synchronous (non-B_ASYNC) faults, the lwp's
		 * major-fault counters as well.
		 */
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
		if ((flags & B_ASYNC) == 0) {
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
			/* Kernel probe */
			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
				tnf_opaque,	vnode,		pp->p_vnode,
				tnf_offset,	offset,		pp->p_offset);
		}
		/*
		 * Update statistics for pages being paged in:
		 * classify as anon (swapfs), executable, or regular
		 * file pagein based on the backing vnode.
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin,
						btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
							btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
							btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
		/* Kernel probe */
		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
			tnf_opaque,	vnode,		pp->p_vnode,
			tnf_offset,	offset,		pp->p_offset,
			tnf_size,	size,		len);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	/* b_io starts un-posted; biodone()'s sema_v() signals completion */
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	VN_HOLD(vp);
	bp->b_vp = vp;
	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}
1385*0Sstevel@tonic-gate 
1386*0Sstevel@tonic-gate void
1387*0Sstevel@tonic-gate pageio_done(struct buf *bp)
1388*0Sstevel@tonic-gate {
1389*0Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
1390*0Sstevel@tonic-gate 	if (bp->b_flags & B_REMAPPED)
1391*0Sstevel@tonic-gate 		bp_mapout(bp);
1392*0Sstevel@tonic-gate 	VN_RELE(bp->b_vp);
1393*0Sstevel@tonic-gate 	bp->b_vp = NULL;
1394*0Sstevel@tonic-gate 	ASSERT((bp->b_flags & B_NOCACHE) != 0);
1395*0Sstevel@tonic-gate 
1396*0Sstevel@tonic-gate 	/* A sema_v(bp->b_sem) is implied if we are destroying it */
1397*0Sstevel@tonic-gate 	sema_destroy(&bp->b_sem);
1398*0Sstevel@tonic-gate 	sema_destroy(&bp->b_io);
1399*0Sstevel@tonic-gate 	kmem_free(bp, sizeof (struct buf));
1400*0Sstevel@tonic-gate }
1401*0Sstevel@tonic-gate 
1402*0Sstevel@tonic-gate /*
1403*0Sstevel@tonic-gate  * Check to see whether the buffers, except the one pointed by sbp,
1404*0Sstevel@tonic-gate  * associated with the device are busy.
1405*0Sstevel@tonic-gate  * NOTE: This expensive operation shall be improved together with ufs_icheck().
1406*0Sstevel@tonic-gate  */
1407*0Sstevel@tonic-gate int
1408*0Sstevel@tonic-gate bcheck(dev_t dev, struct buf *sbp)
1409*0Sstevel@tonic-gate {
1410*0Sstevel@tonic-gate 	struct buf	*bp;
1411*0Sstevel@tonic-gate 	struct buf	*dp;
1412*0Sstevel@tonic-gate 	int i;
1413*0Sstevel@tonic-gate 	kmutex_t *hmp;
1414*0Sstevel@tonic-gate 
1415*0Sstevel@tonic-gate 	/*
1416*0Sstevel@tonic-gate 	 * check for busy bufs for this filesystem
1417*0Sstevel@tonic-gate 	 */
1418*0Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
1419*0Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
1420*0Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
1421*0Sstevel@tonic-gate 
1422*0Sstevel@tonic-gate 		mutex_enter(hmp);
1423*0Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1424*0Sstevel@tonic-gate 			/*
1425*0Sstevel@tonic-gate 			 * if buf is busy or dirty, then filesystem is busy
1426*0Sstevel@tonic-gate 			 */
1427*0Sstevel@tonic-gate 			if ((bp->b_edev == dev) &&
1428*0Sstevel@tonic-gate 			    ((bp->b_flags & B_STALE) == 0) &&
1429*0Sstevel@tonic-gate 			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1430*0Sstevel@tonic-gate 			    (bp != sbp)) {
1431*0Sstevel@tonic-gate 				mutex_exit(hmp);
1432*0Sstevel@tonic-gate 				return (1);
1433*0Sstevel@tonic-gate 			}
1434*0Sstevel@tonic-gate 		}
1435*0Sstevel@tonic-gate 		mutex_exit(hmp);
1436*0Sstevel@tonic-gate 	}
1437*0Sstevel@tonic-gate 	return (0);
1438*0Sstevel@tonic-gate }
1439*0Sstevel@tonic-gate 
/*
 * Hash two 32 bit entities.
 * Folds the bytes of x and then of y into the accumulator with
 * hash = (hash * 7) + byte - 1 at each step.
 */
int
hash2ints(int x, int y)
{
	int hash;
	int shift;

	hash = x - 1;
	for (shift = 8; shift <= 24; shift += 8)
		hash = (hash * 7) + (x >> shift) - 1;
	hash = (hash * 7) + y - 1;
	for (shift = 8; shift <= 24; shift += 8)
		hash = (hash * 7) + (y >> shift) - 1;

	return (hash);
}
1459*0Sstevel@tonic-gate 
1460*0Sstevel@tonic-gate 
/*
 * Return a new buffer struct.
 *	Create a new buffer if we haven't gone over our high water
 *	mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t	*hmp;
	uint_t		start, end;

	/*
	 * bfreelist.b_bufsize represents the amount of memory
	 * we are allowed to allocate in the cache before we hit our
	 * high water mark; references to it are protected by bfree_lock.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

again:
	/* NOTE(review): no goto currently targets this label here */
	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	} else {
		struct buf *save;

		save = bp;	/* Save bp we allocated */
		start = end = lastindex;

		biostats.bio_bufwant.value.ui32++;

		/*
		 * Memory isn't available from the system now. Scan
		 * the hash buckets till enough space is found.
		 */
		do {
			hp = &hbuf[start];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			mutex_enter(hmp);
			bp = dp->av_forw;

			while (bp != dp) {

				ASSERT(bp != NULL);

				/* skip bufs some other thread holds */
				if (!sema_tryp(&bp->b_sem)) {
					bp = bp->av_forw;
					continue;
				}

				/*
				 * Since we are going down the freelist
				 * associated with this hash bucket the
				 * B_DELWRI flag should not be set.
				 */
				ASSERT(!(bp->b_flags & B_DELWRI));

				if (bp->b_bufsize == bsize) {
					/* exact fit: steal this buf's memory */
					hp->b_length--;
					notavail(bp);
					bremhash(bp);
					mutex_exit(hmp);

					/*
					 * Didn't kmem_alloc any more, so don't
					 * count it twice.
					 */
					mutex_enter(&bfree_lock);
					bfreelist.b_bufsize += bsize;
					mutex_exit(&bfree_lock);

					/*
					 * Update the lastindex value.
					 */
					lastindex = start;

					/*
					 * Put our saved bp back on the list
					 */
					sema_v(&save->b_sem);
					bio_bhdr_free(save);
					ASSERT(SEMA_HELD(&bp->b_sem));
					return (bp);
				}
				sema_v(&bp->b_sem);
				bp = bp->av_forw;
			}
			mutex_exit(hmp);
			start = ((start + 1) % v.v_hbuf);
		} while (start != end);

		biostats.bio_bufwait.value.ui32++;
		bp = save;		/* Use original bp */
		/* last resort: block until the system can supply memory */
		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	}

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}
1577*0Sstevel@tonic-gate 
/*
 * Allocate a buffer header. If none currently available, allocate
 * a new pool.
 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	for (;;) {
		/* fast path: pop a header off the global free list */
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);
			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool. If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
		} else {
			/* link the fresh pool into a chain via av_forw */
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				/* b_sem available (1), b_io un-posted (0) */
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
				dp->b_offset = -1;
			}
			/* splice pool onto bhdrlist and take the first hdr */
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}
1640*0Sstevel@tonic-gate 
/*
 * Return an unused buffer header to the global bhdrlist cache.
 * The header must already be fully disconnected: no hash or free
 * list linkage, no data memory, no device identity, and no flags.
 */
static  void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	/* push onto the singly-linked header free list */
	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}
1658*0Sstevel@tonic-gate 
1659*0Sstevel@tonic-gate /*
1660*0Sstevel@tonic-gate  * If we haven't gone over the high water mark, it's o.k. to
1661*0Sstevel@tonic-gate  * allocate more buffer space, otherwise recycle buffers
1662*0Sstevel@tonic-gate  * from the freelist until enough memory is free for a bsize request.
1663*0Sstevel@tonic-gate  *
1664*0Sstevel@tonic-gate  * We account for this memory, even though
1665*0Sstevel@tonic-gate  * we don't allocate it here.
1666*0Sstevel@tonic-gate  */
1667*0Sstevel@tonic-gate static void
1668*0Sstevel@tonic-gate bio_mem_get(long bsize)
1669*0Sstevel@tonic-gate {
1670*0Sstevel@tonic-gate 	mutex_enter(&bfree_lock);
1671*0Sstevel@tonic-gate 	if (bfreelist.b_bufsize > bsize) {
1672*0Sstevel@tonic-gate 		bfreelist.b_bufsize -= bsize;
1673*0Sstevel@tonic-gate 		mutex_exit(&bfree_lock);
1674*0Sstevel@tonic-gate 		return;
1675*0Sstevel@tonic-gate 	}
1676*0Sstevel@tonic-gate 	mutex_exit(&bfree_lock);
1677*0Sstevel@tonic-gate 	bio_recycle(BIO_MEM, bsize);
1678*0Sstevel@tonic-gate }
1679*0Sstevel@tonic-gate 
1680*0Sstevel@tonic-gate /*
1681*0Sstevel@tonic-gate  * flush a list of delayed write buffers.
1682*0Sstevel@tonic-gate  * (currently used only by bio_recycle below.)
1683*0Sstevel@tonic-gate  */
1684*0Sstevel@tonic-gate static void
1685*0Sstevel@tonic-gate bio_flushlist(struct buf *delwri_list)
1686*0Sstevel@tonic-gate {
1687*0Sstevel@tonic-gate 	struct buf *bp;
1688*0Sstevel@tonic-gate 
1689*0Sstevel@tonic-gate 	while (delwri_list != EMPTY_LIST) {
1690*0Sstevel@tonic-gate 		bp = delwri_list;
1691*0Sstevel@tonic-gate 		bp->b_flags |= B_AGE | B_ASYNC;
1692*0Sstevel@tonic-gate 		if (bp->b_vp == NULL) {		/* !ufs */
1693*0Sstevel@tonic-gate 			BWRITE(bp);
1694*0Sstevel@tonic-gate 		} else {			/* ufs */
1695*0Sstevel@tonic-gate 			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1696*0Sstevel@tonic-gate 		}
1697*0Sstevel@tonic-gate 		delwri_list = bp->b_list;
1698*0Sstevel@tonic-gate 		bp->b_list = NULL;
1699*0Sstevel@tonic-gate 	}
1700*0Sstevel@tonic-gate }
1701*0Sstevel@tonic-gate 
/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header (want == BIO_HEADER)
 *	- we need to free up memory (want == BIO_MEM, bsize bytes)
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int	found = 0;
	kmutex_t	*hmp;
	int		start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		/* First pass: reclaim clean bufs on this bucket's freelist */
		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			/* skip bufs some other thread holds */
			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				/* return the bytes to the cache allowance */
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			/* scrub identity/flags before caching the header */
			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					/* recheck under bfree_lock */
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * beginning.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				/* satisfied: flush what we gathered, wake
				 * any waiters, and stop */
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return; /* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_timedwait(&bio_mem_cv, &bfree_lock, lbolt+hz);
	mutex_exit(&bfree_lock);
	goto top;
}
1880*0Sstevel@tonic-gate 
1881*0Sstevel@tonic-gate /*
1882*0Sstevel@tonic-gate  * See if the block is associated with some buffer
1883*0Sstevel@tonic-gate  * (mainly to avoid getting hung up on a wait in breada).
1884*0Sstevel@tonic-gate  */
1885*0Sstevel@tonic-gate static int
1886*0Sstevel@tonic-gate bio_incore(dev_t dev, daddr_t blkno)
1887*0Sstevel@tonic-gate {
1888*0Sstevel@tonic-gate 	struct buf *bp;
1889*0Sstevel@tonic-gate 	struct buf *dp;
1890*0Sstevel@tonic-gate 	uint_t index;
1891*0Sstevel@tonic-gate 	kmutex_t *hmp;
1892*0Sstevel@tonic-gate 
1893*0Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
1894*0Sstevel@tonic-gate 	dp = (struct buf *)&hbuf[index];
1895*0Sstevel@tonic-gate 	hmp = &hbuf[index].b_lock;
1896*0Sstevel@tonic-gate 
1897*0Sstevel@tonic-gate 	mutex_enter(hmp);
1898*0Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1899*0Sstevel@tonic-gate 		if (bp->b_blkno == blkno && bp->b_edev == dev &&
1900*0Sstevel@tonic-gate 		    (bp->b_flags & B_STALE) == 0) {
1901*0Sstevel@tonic-gate 			mutex_exit(hmp);
1902*0Sstevel@tonic-gate 			return (1);
1903*0Sstevel@tonic-gate 		}
1904*0Sstevel@tonic-gate 	}
1905*0Sstevel@tonic-gate 	mutex_exit(hmp);
1906*0Sstevel@tonic-gate 	return (0);
1907*0Sstevel@tonic-gate }
1908*0Sstevel@tonic-gate 
1909*0Sstevel@tonic-gate static void
1910*0Sstevel@tonic-gate bio_pageio_done(struct buf *bp)
1911*0Sstevel@tonic-gate {
1912*0Sstevel@tonic-gate 	if (bp->b_flags & B_PAGEIO) {
1913*0Sstevel@tonic-gate 
1914*0Sstevel@tonic-gate 		if (bp->b_flags & B_REMAPPED)
1915*0Sstevel@tonic-gate 			bp_mapout(bp);
1916*0Sstevel@tonic-gate 
1917*0Sstevel@tonic-gate 		if (bp->b_flags & B_READ)
1918*0Sstevel@tonic-gate 			pvn_read_done(bp->b_pages, bp->b_flags);
1919*0Sstevel@tonic-gate 		else
1920*0Sstevel@tonic-gate 			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1921*0Sstevel@tonic-gate 		pageio_done(bp);
1922*0Sstevel@tonic-gate 	} else {
1923*0Sstevel@tonic-gate 		ASSERT(bp->b_flags & B_REMAPPED);
1924*0Sstevel@tonic-gate 		bp_mapout(bp);
1925*0Sstevel@tonic-gate 		brelse(bp);
1926*0Sstevel@tonic-gate 	}
1927*0Sstevel@tonic-gate }
1928*0Sstevel@tonic-gate 
1929*0Sstevel@tonic-gate /*
1930*0Sstevel@tonic-gate  * bioerror(9F) - indicate error in buffer header
1931*0Sstevel@tonic-gate  * If 'error' is zero, remove the error indication.
1932*0Sstevel@tonic-gate  */
1933*0Sstevel@tonic-gate void
1934*0Sstevel@tonic-gate bioerror(struct buf *bp, int error)
1935*0Sstevel@tonic-gate {
1936*0Sstevel@tonic-gate 	ASSERT(bp != NULL);
1937*0Sstevel@tonic-gate 	ASSERT(error >= 0);
1938*0Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
1939*0Sstevel@tonic-gate 
1940*0Sstevel@tonic-gate 	if (error != 0) {
1941*0Sstevel@tonic-gate 		bp->b_flags |= B_ERROR;
1942*0Sstevel@tonic-gate 	} else {
1943*0Sstevel@tonic-gate 		bp->b_flags &= ~B_ERROR;
1944*0Sstevel@tonic-gate 	}
1945*0Sstevel@tonic-gate 	bp->b_error = error;
1946*0Sstevel@tonic-gate }
1947*0Sstevel@tonic-gate 
/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	/* tear down and re-create: semaphores destroyed, buf zeroed */
	biofini(bp);
	bioinit(bp);
}
1959*0Sstevel@tonic-gate 
/*
 * biosize(9F) - return size of a buffer header
 * Lets callers size allocations without a compile-time
 * dependency on sizeof (struct buf).
 */
size_t
biosize(void)
{
	return (sizeof (struct buf));
}
1968*0Sstevel@tonic-gate 
1969*0Sstevel@tonic-gate /*
1970*0Sstevel@tonic-gate  * biomodified(9F) - check if buffer is modified
1971*0Sstevel@tonic-gate  */
1972*0Sstevel@tonic-gate int
1973*0Sstevel@tonic-gate biomodified(struct buf *bp)
1974*0Sstevel@tonic-gate {
1975*0Sstevel@tonic-gate 	int npf;
1976*0Sstevel@tonic-gate 	int ppattr;
1977*0Sstevel@tonic-gate 	struct page *pp;
1978*0Sstevel@tonic-gate 
1979*0Sstevel@tonic-gate 	ASSERT(bp != NULL);
1980*0Sstevel@tonic-gate 
1981*0Sstevel@tonic-gate 	if ((bp->b_flags & B_PAGEIO) == 0) {
1982*0Sstevel@tonic-gate 		return (-1);
1983*0Sstevel@tonic-gate 	}
1984*0Sstevel@tonic-gate 	pp = bp->b_pages;
1985*0Sstevel@tonic-gate 	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1986*0Sstevel@tonic-gate 
1987*0Sstevel@tonic-gate 	while (npf > 0) {
1988*0Sstevel@tonic-gate 		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1989*0Sstevel@tonic-gate 				HAT_SYNC_STOPON_MOD);
1990*0Sstevel@tonic-gate 		if (ppattr & P_MOD)
1991*0Sstevel@tonic-gate 			return (1);
1992*0Sstevel@tonic-gate 		pp = pp->p_next;
1993*0Sstevel@tonic-gate 		npf--;
1994*0Sstevel@tonic-gate 	}
1995*0Sstevel@tonic-gate 
1996*0Sstevel@tonic-gate 	return (0);
1997*0Sstevel@tonic-gate }
1998*0Sstevel@tonic-gate 
1999*0Sstevel@tonic-gate /*
2000*0Sstevel@tonic-gate  * bioinit(9F) - initialize a buffer structure
2001*0Sstevel@tonic-gate  */
2002*0Sstevel@tonic-gate void
2003*0Sstevel@tonic-gate bioinit(struct buf *bp)
2004*0Sstevel@tonic-gate {
2005*0Sstevel@tonic-gate 	bzero(bp, sizeof (struct buf));
2006*0Sstevel@tonic-gate 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2007*0Sstevel@tonic-gate 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2008*0Sstevel@tonic-gate 	bp->b_offset = -1;
2009*0Sstevel@tonic-gate }
2010*0Sstevel@tonic-gate 
2011*0Sstevel@tonic-gate /*
2012*0Sstevel@tonic-gate  * biofini(9F) - uninitialize a buffer structure
2013*0Sstevel@tonic-gate  */
2014*0Sstevel@tonic-gate void
2015*0Sstevel@tonic-gate biofini(struct buf *bp)
2016*0Sstevel@tonic-gate {
2017*0Sstevel@tonic-gate 	sema_destroy(&bp->b_io);
2018*0Sstevel@tonic-gate 	sema_destroy(&bp->b_sem);
2019*0Sstevel@tonic-gate }
2020*0Sstevel@tonic-gate 
2021*0Sstevel@tonic-gate /*
2022*0Sstevel@tonic-gate  * bioclone(9F) - clone a buffer
2023*0Sstevel@tonic-gate  */
2024*0Sstevel@tonic-gate struct buf *
2025*0Sstevel@tonic-gate bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2026*0Sstevel@tonic-gate     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2027*0Sstevel@tonic-gate {
2028*0Sstevel@tonic-gate 	struct buf *bufp;
2029*0Sstevel@tonic-gate 
2030*0Sstevel@tonic-gate 	ASSERT(bp);
2031*0Sstevel@tonic-gate 	if (bp_mem == NULL) {
2032*0Sstevel@tonic-gate 		bufp = kmem_alloc(sizeof (struct buf), sleep);
2033*0Sstevel@tonic-gate 		if (bufp == NULL) {
2034*0Sstevel@tonic-gate 			return (NULL);
2035*0Sstevel@tonic-gate 		}
2036*0Sstevel@tonic-gate 		bioinit(bufp);
2037*0Sstevel@tonic-gate 	} else {
2038*0Sstevel@tonic-gate 		bufp = bp_mem;
2039*0Sstevel@tonic-gate 		bioreset(bufp);
2040*0Sstevel@tonic-gate 	}
2041*0Sstevel@tonic-gate 
2042*0Sstevel@tonic-gate #define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2043*0Sstevel@tonic-gate 	B_ABRWRITE)
2044*0Sstevel@tonic-gate 
2045*0Sstevel@tonic-gate 	/*
2046*0Sstevel@tonic-gate 	 * the cloned buffer does not inherit the B_REMAPPED flag. A separate
2047*0Sstevel@tonic-gate 	 * bp_mapin(9F) has to be done to get a kernel mapping.
2048*0Sstevel@tonic-gate 	 */
2049*0Sstevel@tonic-gate 	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
2050*0Sstevel@tonic-gate 	bufp->b_bcount = len;
2051*0Sstevel@tonic-gate 	bufp->b_blkno = blkno;
2052*0Sstevel@tonic-gate 	bufp->b_iodone = iodone;
2053*0Sstevel@tonic-gate 	bufp->b_proc = bp->b_proc;
2054*0Sstevel@tonic-gate 	bufp->b_edev = dev;
2055*0Sstevel@tonic-gate 	bufp->b_file = bp->b_file;
2056*0Sstevel@tonic-gate 	bufp->b_offset = bp->b_offset;
2057*0Sstevel@tonic-gate 
2058*0Sstevel@tonic-gate 	if (bp->b_flags & B_SHADOW) {
2059*0Sstevel@tonic-gate 		ASSERT(bp->b_shadow);
2060*0Sstevel@tonic-gate 		ASSERT(bp->b_flags & B_PHYS);
2061*0Sstevel@tonic-gate 
2062*0Sstevel@tonic-gate 		bufp->b_shadow = bp->b_shadow +
2063*0Sstevel@tonic-gate 			btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2064*0Sstevel@tonic-gate 		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2065*0Sstevel@tonic-gate 	} else {
2066*0Sstevel@tonic-gate 		if (bp->b_flags & B_PAGEIO) {
2067*0Sstevel@tonic-gate 			struct page *pp;
2068*0Sstevel@tonic-gate 			off_t o;
2069*0Sstevel@tonic-gate 			int i;
2070*0Sstevel@tonic-gate 
2071*0Sstevel@tonic-gate 			pp = bp->b_pages;
2072*0Sstevel@tonic-gate 			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2073*0Sstevel@tonic-gate 			for (i = btop(o); i > 0; i--) {
2074*0Sstevel@tonic-gate 				pp = pp->p_next;
2075*0Sstevel@tonic-gate 			}
2076*0Sstevel@tonic-gate 			bufp->b_pages = pp;
2077*0Sstevel@tonic-gate 			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2078*0Sstevel@tonic-gate 		} else {
2079*0Sstevel@tonic-gate 			bufp->b_un.b_addr =
2080*0Sstevel@tonic-gate 				(caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2081*0Sstevel@tonic-gate 			if (bp->b_flags & B_REMAPPED)
2082*0Sstevel@tonic-gate 				bufp->b_proc = NULL;
2083*0Sstevel@tonic-gate 		}
2084*0Sstevel@tonic-gate 	}
2085*0Sstevel@tonic-gate 	return (bufp);
2086*0Sstevel@tonic-gate }
2087