/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>

/*
 * Kernel threads for logging
 * Currently only one for rolling the log (one per log).
 */

#define LUFS_DEFAULT_NUM_ROLL_BUFS 16
#define LUFS_DEFAULT_MIN_ROLL_BUFS 4
#define LUFS_DEFAULT_MAX_ROLL_BUFS 64

/*
 * Macros
 */
#define logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;

/*
 * Key for thread specific data for the roll thread to
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t ml_scan;
extern kcondvar_t ml_scan_cv;
extern int maxphys;

static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
        mutex_enter(&logmap->mtm_mutex);
        logmap->mtm_ref = 0;
        if (logmap->mtm_flags & MTM_FORCE_ROLL) {
                cv_broadcast(&logmap->mtm_from_roll_cv);
        }
        logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
        CALLB_CPR_SAFE_BEGIN(cprinfop);
        (void) cv_timedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
            lbolt + trans_roll_tics);
        CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
        logmap->mtm_flags |= MTM_ROLLING;
        mutex_exit(&logmap->mtm_mutex);
}

/*
 * returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers(void)
{
        /*
         * sanity validate the tunable lufs_num_roll_bufs
         */
        if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
                return (lufs_min_roll_bufs);
        }
        if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
                return (lufs_max_roll_bufs);
        }
        return (lufs_num_roll_bufs);
}
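
/*
 * The tunables above are ordinary module variables, so (assuming the
 * standard /etc/system module-variable mechanism) the buffer count
 * could be raised on a busy system with, e.g.:
 *
 *	set ufs:lufs_num_roll_bufs = 32
 *
 * log_roll_buffers() clamps whatever value is set to the
 * [lufs_min_roll_bufs, lufs_max_roll_bufs] range.
 */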

/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
        offset_t        mof;
        buf_t           *bp;
        rollbuf_t       *rbp;
        mt_map_t        *logmap = ul->un_logmap;
        daddr_t         mblkno;
        int             i;
        int             error;
        int             nbuf;

        /*
         * Make sure there is really something to roll
         */
        mof = 0;
        if (!logmap_next_roll(logmap, &mof)) {
                return (1);
        }

        /*
         * build some master blocks + deltas to roll forward
         */
        rw_enter(&logmap->mtm_rwlock, RW_READER);
        nbuf = 0;
        do {
                mof = mof & (offset_t)MAPBLOCKMASK;
                mblkno = lbtodb(mof);

                /*
                 * Check for the case of a new delta to an already
                 * set up buffer
                 */
                for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
                        if (P2ALIGN(rbp->rb_bh.b_blkno,
                            MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
                                TNF_PROBE_0(trans_roll_new_delta, "lufs",
                                    /* CSTYLED */);
                                trans_roll_new_delta++;
                                /* Flush out the current set of buffers */
                                goto flush_bufs;
                        }
                }
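
                /*
                 * (A match above means a fresh delta has arrived for a
                 * MAPBLOCK already staged in this set.  The set is
                 * flushed and rebuilt rather than patched in place,
                 * presumably so each staged buffer stays consistent
                 * with the logmap walk.)
                 */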

                /*
                 * Work out what to roll next. If it isn't cached then read
                 * it asynchronously from the master.
                 */
                bp = &rbp->rb_bh;
                bp->b_blkno = mblkno;
                bp->b_flags = B_READ;
                bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
                bp->b_bufsize = MAPBLOCKSIZE;
                if (top_read_roll(rbp, ul)) {
                        /* logmap deltas were in use */
                        if (nbuf == 0) {
                                /*
                                 * On first buffer wait for the logmap user
                                 * to finish by grabbing the logmap lock
                                 * exclusively rather than spinning
                                 */
                                rw_exit(&logmap->mtm_rwlock);
                                lrr_wait++;
                                rw_enter(&logmap->mtm_rwlock, RW_WRITER);
                                rw_exit(&logmap->mtm_rwlock);
                                return (1);
                        }
                        /* we have at least one buffer - flush it */
                        goto flush_bufs;
                }
                if ((bp->b_flags & B_INVAL) == 0) {
                        nbuf++;
                }
                mof += MAPBLOCKSIZE;
        } while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

        /*
         * If there was nothing to roll cycle back
         */
        if (nbuf == 0) {
                rw_exit(&logmap->mtm_rwlock);
                return (1);
        }

flush_bufs:
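        /*
         * (A rollbuf with rb_crb set already holds its delta contents in
         * a cached roll buffer, so the non-cached branch below, with its
         * master-read wait and log read, is skipped for it.)
         */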
        /*
         * For each buffer, if it isn't cached then wait for the read to
         * finish and overlay the deltas.
         */
        for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
                if (!rbp->rb_crb) {
                        bp = &rbp->rb_bh;
                        if (trans_not_wait(bp)) {
                                ldl_seterror(ul,
                                    "Error reading master during ufs log roll");
                                error = 1;
                        }
                        /*
                         * sync read the data from the log
                         */
                        if (ldl_read(ul, bp->b_un.b_addr,
                            ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
                            MAPBLOCKSIZE, rbp->rb_age)) {
                                error = 1;
                        }
                }

                /*
                 * reset the age bit in the age list
                 */
                logmap_list_put_roll(logmap, rbp->rb_age);

                if (ul->un_flags & LDL_ERROR) {
                        error = 1;
                }
        }
        rw_exit(&logmap->mtm_rwlock);
        if (error)
                return (2);
        *retnbuf = nbuf;
        return (0);
}

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
        crb_t   *crb = rbp->rb_crb;
        buf_t   *bp = &rbp->rb_bh;

        bp->b_blkno = lbtodb(crb->c_mof);
        bp->b_un.b_addr = crb->c_buf;
        bp->b_bcount = crb->c_nb;
        bp->b_bufsize = crb->c_nb;
        ASSERT((crb->c_nb & DEV_BMASK) == 0);
        bp->b_flags = B_WRITE;
        logstats.ls_rwrites.value.ui64++;

        /* if snapshots are enabled, call it */
        if (ufsvfsp->vfs_snapshot) {
                fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
        } else {
                (void) bdev_strategy(bp);
        }
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
        buf_t           *bp = &rbp->rb_bh;
        buf_t           *bp2;
        rbsecmap_t      secmap = rbp->rb_secmap;
        int             j, k;

        ASSERT(secmap);
        ASSERT((bp->b_flags & B_INVAL) == 0);
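
        /*
         * Worked example (illustrative): with 512-byte sectors a MAPBLOCK
         * holds 16 sectors, one secmap bit each.  For secmap == 0x003c
         * (binary 0000000000111100) the first loop below skips j == 2
         * clean sectors, the second loop counts k == 4 dirty ones, and a
         * single 2K write covering sectors 2..5 of the block is issued.
         */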

        do { /* for each contiguous block of sectors */
                /* find start of next sector to write */
                for (j = 0; j < 16; ++j) {
                        if (secmap & UINT16_C(1))
                                break;
                        secmap >>= 1;
                }
                bp->b_un.b_addr += (j << DEV_BSHIFT);
                bp->b_blkno += j;

                /* calculate number of sectors */
                secmap >>= 1;
                j++;
                for (k = 1; j < 16; ++j) {
                        if ((secmap & UINT16_C(1)) == 0)
                                break;
                        secmap >>= 1;
                        k++;
                }
                bp->b_bcount = k << DEV_BSHIFT;
                bp->b_flags = B_WRITE;
                logstats.ls_rwrites.value.ui64++;

                /* if snapshots are enabled, call it */
                if (ufsvfsp->vfs_snapshot)
                        fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
                else
                        (void) bdev_strategy(bp);

                if (secmap) {
                        /*
                         * Allocate another buf_t to handle
                         * the next write in this MAPBLOCK
                         * Chain them via b_list.
                         */
                        bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
                        bp->b_list = bp2;
                        bioinit(bp2);
                        bp2->b_iodone = trans_not_done;
                        bp2->b_bufsize = MAPBLOCKSIZE;
                        bp2->b_edev = bp->b_edev;
                        bp2->b_un.b_addr =
                            bp->b_un.b_addr + bp->b_bcount;
                        bp2->b_blkno = bp->b_blkno + k;
                        bp = bp2;
                }
        } while (secmap);
}

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{
        ufsvfs_t        *ufsvfsp = ul->un_ufsvfs;
        rollbuf_t       *rbp;
        buf_t           *bp, *bp2;
        rollbuf_t       *head, *prev, *rbp2;

        /*
         * Order the buffers by blkno
         */
        ASSERT(nbuf > 0);
#ifdef lint
        prev = rbs;
#endif
        for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
                for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
                        if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
                                if (rbp2 == head) {
                                        rbp->rb_next = head;
                                        head = rbp;
                                } else {
                                        prev->rb_next = rbp;
                                        rbp->rb_next = rbp2;
                                }
                                break;
                        }
                }
                if (rbp2 == NULL) {
                        prev->rb_next = rbp;
                        rbp->rb_next = NULL;
                }
        }
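
        /*
         * (The list insertion above is a simple O(n^2) sort; with at
         * most lufs_max_roll_bufs (64) entries that is cheap, and it
         * lets the writes below be issued in ascending block order.)
         */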

        /*
         * issue the in-order writes
         */
        for (rbp = head; rbp; rbp = rbp2) {
                if (rbp->rb_crb) {
                        log_roll_write_crb(ufsvfsp, rbp);
                } else {
                        log_roll_write_bufs(ufsvfsp, rbp);
                }
                /* null out the rb_next link for next set of rolling */
                rbp2 = rbp->rb_next;
                rbp->rb_next = NULL;
        }

        /*
         * wait for all the writes to finish
         */
        for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
                bp = &rbp->rb_bh;
                if (trans_not_wait(bp)) {
                        ldl_seterror(ul,
                            "Error writing master during ufs log roll");
                }

                /*
                 * Now wait for all the "cloned" buffer writes (if any)
                 * and free those headers
                 */
                bp2 = bp->b_list;
                bp->b_list = NULL;
                while (bp2) {
                        if (trans_not_wait(bp2)) {
                                ldl_seterror(ul,
                                    "Error writing master during ufs log roll");
                        }
                        bp = bp2;
                        bp2 = bp2->b_list;
                        kmem_free(bp, sizeof (buf_t));
                }
        }

        if (ul->un_flags & LDL_ERROR)
                return (1);
        return (0);
}
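
/*
 * trans_roll is the body of the roll thread (one per log, created
 * elsewhere in the lufs code); it loops until MTM_ROLL_EXIT is set or a
 * log error occurs, copying committed deltas from the log back to the
 * master file system.
 */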
void
trans_roll(ml_unit_t *ul)
{
        callb_cpr_t     cprinfo;
        mt_map_t        *logmap = ul->un_logmap;
        rollbuf_t       *rbs;
        rollbuf_t       *rbp;
        buf_t           *bp;
        caddr_t         roll_bufs;
        uint32_t        nmblk;
        int             i;
        int             doingforceroll;
        int             nbuf;

        CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
            "trans_roll");

        /*
         * We do not want the roll thread's writes to be
         * throttled by the snapshot.
         * If they are throttled then we can have a deadlock
         * between the roll thread and the snapshot taskq thread:
         * roll thread wants the throttling semaphore and
         * the snapshot taskq thread cannot release the semaphore
         * because it is writing to the log and the log is full.
         */
        (void) tsd_set(bypass_snapshot_throttle_key, (void *)1);

        /*
         * setup some roll parameters
         */
        if (trans_roll_tics == 0)
                trans_roll_tics = 5 * hz;
        nmblk = log_roll_buffers();

        /*
         * allocate the buffers and buffer headers
         */
        roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
        rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

        /*
         * initialize the buffer headers
         */
        for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
                rbp->rb_next = NULL;
                bp = &rbp->rb_bh;
                bioinit(bp);
                bp->b_edev = ul->un_dev;
                bp->b_iodone = trans_not_done;
                bp->b_bufsize = MAPBLOCKSIZE;
        }
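
        /*
         * (With the defaults this is 16 buffers of MAPBLOCKSIZE (8K)
         * each, about 128K in all; both allocations live until the
         * exit path below frees them.)
         */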

        doingforceroll = 0;

again:
        /*
         * LOOP FOREVER
         */

        /*
         * exit on demand
         */
        mutex_enter(&logmap->mtm_mutex);
        if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
                kmem_free(rbs, nmblk * sizeof (rollbuf_t));
                kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
                logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
                    MTM_ROLL_EXIT | MTM_ROLLING);
                cv_broadcast(&logmap->mtm_from_roll_cv);
                CALLB_CPR_EXIT(&cprinfo);
                thread_exit();
                /* NOTREACHED */
        }

        /*
         * MT_SCAN debug mode
         * don't roll except in FORCEROLL situations
         */
        if (logmap->mtm_debug & MT_SCAN)
                if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
                        mutex_exit(&logmap->mtm_mutex);
                        trans_roll_wait(logmap, &cprinfo);
                        goto again;
                }
        ASSERT(logmap->mtm_trimlof == 0);

        /*
         * If we've finished a force roll cycle then wakeup any
         * waiters.
         */
        if (doingforceroll) {
                doingforceroll = 0;
                logmap->mtm_flags &= ~MTM_FORCE_ROLL;
                mutex_exit(&logmap->mtm_mutex);
                cv_broadcast(&logmap->mtm_from_roll_cv);
        } else {
                mutex_exit(&logmap->mtm_mutex);
        }

        /*
         * If someone wants us to roll something; then do it
         */
        if (logmap->mtm_flags & MTM_FORCE_ROLL) {
                doingforceroll = 1;
                goto rollsomething;
        }

        /*
         * Log is busy, check if logmap is getting full.
         */
        if (logmap_need_roll(logmap)) {
                goto rollsomething;
        }

        /*
         * Check if the log is idle and is not empty
         */
        if (!logmap->mtm_ref && !ldl_empty(ul)) {
                goto rollsomething;
        }

        /*
         * Log is busy, check if it's getting full
         */
        if (ldl_need_roll(ul)) {
                goto rollsomething;
        }

        /*
         * nothing to do; wait a bit and then start over
         */
        trans_roll_wait(logmap, &cprinfo);
        goto again;

        /*
         * ROLL SOMETHING
         */

rollsomething:
        /*
         * Use the cached roll buffers, or read the master
         * and overlay the deltas
         */
        switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
        case 1: trans_roll_wait(logmap, &cprinfo);
                /* FALLTHROUGH */
        case 2: goto again;
        /* default case is success */
        }

        /*
         * Asynchronously write out the deltas
         */
        if (log_roll_write(ul, rbs, nbuf))
                goto again;

        /*
         * free up the deltas in the logmap
         */
        for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
                bp = &rbp->rb_bh;
                logmap_remove_roll(logmap,
                    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
        }

        /*
         * free up log space; if possible
         */
        logmap_sethead(logmap, ul);

        /*
         * LOOP
         */
        goto again;
}