/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>

/*
 * Kernel threads for logging
 * Currently only one for rolling the log (one per log).
 */
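/*
 * Rolling copies committed deltas from the on-disk log back to their
 * home locations in the master file system so that log space can be
 * reclaimed.  trans_roll(), below, is the entry point of the roll
 * thread.
 */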

#define	LUFS_DEFAULT_NUM_ROLL_BUFS 16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS 4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS 64

/*
 * Macros
 * logmap_need_roll: the logmap holds more deltas than logmap_maxnme,
 *	so the roll thread should start rolling them.
 * ldl_empty: the on-disk log holds no deltas (head offset equals
 *	tail offset).
 */
#define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;
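/*
 * Example (a sketch; the "ufs" module name is an assumption, verify
 * against your release): these tunables can be set at boot time from
 * /etc/system, e.g.
 *	set ufs:lufs_num_roll_bufs = 32
 */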
/*
 * Key for thread-specific data for the roll thread to
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t		ml_scan;
extern kcondvar_t	ml_scan_cv;
extern int		maxphys;

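/*
 * Park the roll thread between passes: clear the logmap reference bit,
 * wake any thread waiting for a forced roll to complete, and sleep on
 * mtm_to_roll_cv for up to trans_roll_tics ticks.  (The requesting
 * side of the handshake lives elsewhere in lufs; as a sketch, a caller
 * sets MTM_FORCE_ROLL, signals mtm_to_roll_cv, and waits on
 * mtm_from_roll_cv.)
 */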
static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_timedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    lbolt + trans_roll_tics);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}

/*
 * returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers(void)
{
	/*
	 * sanity validate the tunable lufs_num_roll_bufs
	 */
	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
		return (lufs_min_roll_bufs);
	}
	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
		return (lufs_max_roll_bufs);
	}
	return (lufs_num_roll_bufs);
}

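/*
 * Each rollbuf_t pairs the buf_t used for the I/O with the deltas
 * being rolled for one MAPBLOCK: rb_crb points at a cached roll
 * buffer when one covers the deltas, rb_secmap marks the sectors that
 * must be written, and rb_age holds the list of logmap deltas being
 * rolled for that MAPBLOCK (a description inferred from usage in this
 * file).
 */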
/*
 * Find something to roll; if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK, read the master
 * and overlay the deltas.
 * Returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to a buffer
		 * that is already set up
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll, cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	crb_t *crb = rbp->rb_crb;
	buf_t *bp = &rbp->rb_bh;

	bp->b_blkno = lbtodb(crb->c_mof);
	bp->b_un.b_addr = crb->c_buf;
	bp->b_bcount = crb->c_nb;
	bp->b_bufsize = crb->c_nb;
	ASSERT((crb->c_nb & DEV_BMASK) == 0);
	bp->b_flags = B_WRITE;
	logstats.ls_rwrites.value.ui64++;

	/* if snapshots are enabled, write via the snapshot driver */
	if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		(void) bdev_strategy(bp);
	}
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t		*bp = &rbp->rb_bh;
	buf_t		*bp2;
	rbsecmap_t	secmap = rbp->rb_secmap;
	int		j, k;

	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);

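	/*
	 * secmap has one bit per DEV_BSIZE (512 byte) sector of the
	 * MAPBLOCK; a set bit marks a sector that holds deltas and must
	 * be written.  Each run of consecutive set bits becomes one
	 * contiguous write (e.g. a secmap of 0x00f0 writes sectors 4-7
	 * as a single 2K transfer).
	 */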
	do { /* for each contiguous block of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;

		/* calculate number of sectors */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			secmap >>= 1;
			k++;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;

		/* if snapshots are enabled, write via the snapshot driver */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);
		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK.
			 * Chain them via b_list.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	rollbuf_t	*rbp;
	buf_t		*bp, *bp2;
	rollbuf_t	*head, *prev, *rbp2;

	/*
	 * Order the buffers by blkno: an insertion sort into a singly
	 * linked list threaded through rb_next, so that the writes
	 * below are issued in ascending disk order.
	 */
	ASSERT(nbuf > 0);
#ifdef lint
	prev = rbs;
#endif
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}

	/*
	 * issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}

	/*
	 * wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}

		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}

	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}

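/*
 * trans_roll() is the body of the roll thread.  It loops forever:
 * find committed deltas in the logmap, read or reconstruct the
 * affected master blocks, write them home in disk order, release the
 * rolled deltas, and advance the log head to reclaim log space.
 */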
void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * the roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */
	(void) tsd_set(bypass_snapshot_throttle_key, (void *)1);

	/*
	 * setup some roll parameters
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * exit on demand
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 *	don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wake up any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something, then do it
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * The log is busy; check whether the logmap is getting full
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * The log is busy; check whether it's getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1: trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2: goto again;
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space, if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}