xref: /onnv-gate/usr/src/uts/common/fs/ufs/lufs_thread.c (revision 4662:9c48274ded8b)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*4662Sfrankho  * Common Development and Distribution License (the "License").
6*4662Sfrankho  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*4662Sfrankho  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
270Sstevel@tonic-gate 
280Sstevel@tonic-gate #include <sys/systm.h>
290Sstevel@tonic-gate #include <sys/types.h>
300Sstevel@tonic-gate #include <sys/vnode.h>
310Sstevel@tonic-gate #include <sys/errno.h>
320Sstevel@tonic-gate #include <sys/sysmacros.h>
330Sstevel@tonic-gate #include <sys/debug.h>
340Sstevel@tonic-gate #include <sys/kmem.h>
350Sstevel@tonic-gate #include <sys/conf.h>
360Sstevel@tonic-gate #include <sys/proc.h>
370Sstevel@tonic-gate #include <sys/cmn_err.h>
380Sstevel@tonic-gate #include <sys/fssnap_if.h>
390Sstevel@tonic-gate #include <sys/fs/ufs_inode.h>
400Sstevel@tonic-gate #include <sys/fs/ufs_filio.h>
410Sstevel@tonic-gate #include <sys/fs/ufs_log.h>
420Sstevel@tonic-gate #include <sys/fs/ufs_bio.h>
430Sstevel@tonic-gate #include <sys/inttypes.h>
440Sstevel@tonic-gate #include <sys/callb.h>
450Sstevel@tonic-gate #include <sys/tnf_probe.h>
460Sstevel@tonic-gate 
470Sstevel@tonic-gate /*
480Sstevel@tonic-gate  * Kernel threads for logging
490Sstevel@tonic-gate  * Currently only one for rolling the log (one per log).
500Sstevel@tonic-gate  */
510Sstevel@tonic-gate 
520Sstevel@tonic-gate #define	LUFS_DEFAULT_NUM_ROLL_BUFS 16
530Sstevel@tonic-gate #define	LUFS_DEFAULT_MIN_ROLL_BUFS 4
540Sstevel@tonic-gate #define	LUFS_DEFAULT_MAX_ROLL_BUFS 64
550Sstevel@tonic-gate 
560Sstevel@tonic-gate /*
570Sstevel@tonic-gate  * Macros
580Sstevel@tonic-gate  */
590Sstevel@tonic-gate #define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
600Sstevel@tonic-gate #define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)
610Sstevel@tonic-gate 
620Sstevel@tonic-gate /*
630Sstevel@tonic-gate  * Tunables
640Sstevel@tonic-gate  */
650Sstevel@tonic-gate uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
660Sstevel@tonic-gate uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
670Sstevel@tonic-gate uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
680Sstevel@tonic-gate long logmap_maxnme = 1536;
690Sstevel@tonic-gate int trans_roll_tics = 0;
700Sstevel@tonic-gate uint64_t trans_roll_new_delta = 0;
710Sstevel@tonic-gate uint64_t lrr_wait = 0;
720Sstevel@tonic-gate /*
730Sstevel@tonic-gate  * Key for thread specific data for the roll thread to
740Sstevel@tonic-gate  * bypass snapshot throttling
750Sstevel@tonic-gate  */
760Sstevel@tonic-gate uint_t bypass_snapshot_throttle_key;
770Sstevel@tonic-gate 
780Sstevel@tonic-gate /*
790Sstevel@tonic-gate  * externs
800Sstevel@tonic-gate  */
810Sstevel@tonic-gate extern kmutex_t		ml_scan;
820Sstevel@tonic-gate extern kcondvar_t	ml_scan_cv;
830Sstevel@tonic-gate extern int		maxphys;
840Sstevel@tonic-gate 
/*
 * Park the roll thread between rolling passes.
 *
 * Clears the logmap reference hint, wakes any threads blocked waiting for
 * a forced roll to complete, drops the MTM_FORCE_ROLL/MTM_ROLLING state,
 * and then sleeps (CPR-safe) on mtm_to_roll_cv for at most trans_roll_tics
 * ticks.  On wakeup the thread re-marks itself MTM_ROLLING before
 * returning to the caller's main loop.
 *
 * logmap	- the log's mt_map_t whose mutex/cv's are used
 * cprinfop	- CPR (suspend/resume) bookkeeping for this kernel thread
 */
static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	/* forget that the log has been referenced since the last pass */
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		/* wake anyone waiting on the forced roll we just finished */
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	/* mark this thread safe to suspend while it sleeps */
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_timedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    lbolt + trans_roll_tics);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}
1010Sstevel@tonic-gate 
1020Sstevel@tonic-gate /*
1030Sstevel@tonic-gate  * returns the number of 8K buffers to use for rolling the log
1040Sstevel@tonic-gate  */
1050Sstevel@tonic-gate static uint32_t
1060Sstevel@tonic-gate log_roll_buffers()
1070Sstevel@tonic-gate {
1080Sstevel@tonic-gate 	/*
1090Sstevel@tonic-gate 	 * sanity validate the tunable lufs_num_roll_bufs
1100Sstevel@tonic-gate 	 */
1110Sstevel@tonic-gate 	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
1120Sstevel@tonic-gate 		return (lufs_min_roll_bufs);
1130Sstevel@tonic-gate 	}
1140Sstevel@tonic-gate 	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
1150Sstevel@tonic-gate 		return (lufs_max_roll_bufs);
1160Sstevel@tonic-gate 	}
1170Sstevel@tonic-gate 	return (lufs_num_roll_bufs);
1180Sstevel@tonic-gate }
1190Sstevel@tonic-gate 
/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * returns:
 * 	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;		/* master offset being considered */
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;		/* disk block of current MAPBLOCK */
	int		i;
	int		error;
	int		nbuf;		/* buffers set up so far */

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		/* round the delta's offset down to its MAPBLOCK */
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to a set up buffer:
		 * if this MAPBLOCK is already covered by one of the
		 * buffers built in this pass, stop collecting and flush
		 * what we have so the new delta is picked up next pass.
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 * Note: rbp now points one past the last set-up rollbuf,
		 * i.e. at the slot for this MAPBLOCK.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		/* B_INVAL means top_read_roll found nothing usable here */
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			/* no cached roll buffer: wait for the async read */
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}
2510Sstevel@tonic-gate 
2520Sstevel@tonic-gate /*
2530Sstevel@tonic-gate  * Write out a cached roll buffer
2540Sstevel@tonic-gate  */
2550Sstevel@tonic-gate void
2560Sstevel@tonic-gate log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
2570Sstevel@tonic-gate {
2580Sstevel@tonic-gate 	crb_t *crb = rbp->rb_crb;
2590Sstevel@tonic-gate 	buf_t *bp = &rbp->rb_bh;
2600Sstevel@tonic-gate 
2610Sstevel@tonic-gate 	bp->b_blkno = lbtodb(crb->c_mof);
2620Sstevel@tonic-gate 	bp->b_un.b_addr = crb->c_buf;
2630Sstevel@tonic-gate 	bp->b_bcount = crb->c_nb;
2640Sstevel@tonic-gate 	bp->b_bufsize = crb->c_nb;
2650Sstevel@tonic-gate 	ASSERT((crb->c_nb & DEV_BMASK) == 0);
2660Sstevel@tonic-gate 	bp->b_flags = B_WRITE;
2670Sstevel@tonic-gate 	logstats.ls_rwrites.value.ui64++;
2680Sstevel@tonic-gate 
2690Sstevel@tonic-gate 	/* if snapshots are enabled, call it */
2700Sstevel@tonic-gate 	if (ufsvfsp->vfs_snapshot) {
2710Sstevel@tonic-gate 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
2720Sstevel@tonic-gate 	} else {
2730Sstevel@tonic-gate 		(void) bdev_strategy(bp);
2740Sstevel@tonic-gate 	}
2750Sstevel@tonic-gate }
2760Sstevel@tonic-gate 
2770Sstevel@tonic-gate /*
2780Sstevel@tonic-gate  * Write out a set of non cached roll buffers
2790Sstevel@tonic-gate  */
/*
 * Write out a set of non cached roll buffers.
 *
 * rb_secmap is a 16-bit mask, one bit per DEV_BSIZE sector of the
 * MAPBLOCK, with set bits marking sectors that carry deltas and must be
 * written.  Each contiguous run of set bits becomes one async write;
 * additional runs get freshly allocated buf_t's chained off b_list so
 * the caller (log_roll_write) can wait on and free them.
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t		*bp = &rbp->rb_bh;
	buf_t		*bp2;
	rbsecmap_t	secmap = rbp->rb_secmap;
	int		j, k;

	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);

	do { /* for each contiguous block of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		/* advance the buffer to the first dirty sector */
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;

		/* calculate number of sectors (k) in this contiguous run */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			secmap >>= 1;
			k++;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;

		/* if snapshots are enabled, call it */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);
		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK
			 * Chain them via b_list.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			/* next run starts just past the one we issued */
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}
3380Sstevel@tonic-gate 
3390Sstevel@tonic-gate /*
3400Sstevel@tonic-gate  * Asynchronously roll the deltas, using the sector map
3410Sstevel@tonic-gate  * in each rollbuf_t.
3420Sstevel@tonic-gate  */
/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 *
 * Sorts the nbuf rollbufs by starting block number (insertion sort into
 * a singly linked list via rb_next), issues the writes in that order,
 * then waits for every write — including the "cloned" buf_t's chained
 * off b_list by log_roll_write_bufs — and frees the clones.
 *
 * Returns 1 if the log unit went into LDL_ERROR state, else 0.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{

	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	rollbuf_t	*rbp;
	buf_t		*bp, *bp2;
	rollbuf_t	*head, *prev, *rbp2;

	/*
	 * Order the buffers by blkno
	 */
	ASSERT(nbuf > 0);
#ifdef lint
	prev = rbs;
#endif
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		/* walk the sorted list to find rbp's insertion point */
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		/* ran off the end: rbp has the largest blkno so far */
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}

	/*
	 * issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}

	/*
	 * wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}

		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}

	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}
4230Sstevel@tonic-gate 
/*
 * Main body of the per-log roll kernel thread.  Loops forever: decides
 * whether anything needs rolling (forced roll, logmap filling, idle log
 * with entries, log device filling), reads the master blocks and overlays
 * deltas (log_roll_read), writes them back (log_roll_write), then trims
 * the logmap and log.  Exits via thread_exit() on LDL_ERROR or
 * MTM_ROLL_EXIT.  Never returns.
 */
void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;		/* array of nmblk roll buffer headers */
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;	/* nmblk contiguous MAPBLOCK buffers */
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */

	(void) tsd_set(bypass_snapshot_throttle_key, (void*)1);

	/*
	 * setup some roll parameters
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * exit on demand: free everything, clear the roll-thread state
	 * flags, wake waiters and terminate this thread.
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 *	don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wakeup any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something; then do it
	 * NOTE(review): mtm_flags is read here without mtm_mutex held;
	 * presumably a stale read only costs an extra pass or delay —
	 * confirm against the logmap locking rules.
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if its getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1: trans_roll_wait(logmap, &cprinfo);	/* nothing to roll */
		/* FALLTHROUGH */
	case 2: goto again;				/* error */
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space; if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}
601