10Sstevel@tonic-gate /*
20Sstevel@tonic-gate * CDDL HEADER START
30Sstevel@tonic-gate *
40Sstevel@tonic-gate * The contents of this file are subject to the terms of the
5*6747Sga159272 * Common Development and Distribution License (the "License").
6*6747Sga159272 * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate *
80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate * See the License for the specific language governing permissions
110Sstevel@tonic-gate * and limitations under the License.
120Sstevel@tonic-gate *
130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate *
190Sstevel@tonic-gate * CDDL HEADER END
200Sstevel@tonic-gate */
210Sstevel@tonic-gate /*
22*6747Sga159272 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
230Sstevel@tonic-gate * Use is subject to license terms.
240Sstevel@tonic-gate */
250Sstevel@tonic-gate
260Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI"
270Sstevel@tonic-gate
280Sstevel@tonic-gate #include <sys/param.h>
290Sstevel@tonic-gate #include <sys/vnode.h>
300Sstevel@tonic-gate #include <sys/fs/ufs_fsdir.h>
310Sstevel@tonic-gate #include <sys/fs/ufs_fs.h>
320Sstevel@tonic-gate #include <sys/fs/ufs_inode.h>
330Sstevel@tonic-gate #include <sys/fs/ufs_log.h>
340Sstevel@tonic-gate #include <sys/sysmacros.h>
350Sstevel@tonic-gate #include <sys/promif.h>
360Sstevel@tonic-gate #include <sys/machparam.h>
370Sstevel@tonic-gate
380Sstevel@tonic-gate #include <sys/stat.h>
390Sstevel@tonic-gate #include <sys/bootdebug.h>
400Sstevel@tonic-gate #include <sys/salib.h>
410Sstevel@tonic-gate #include <sys/saio.h>
420Sstevel@tonic-gate #include <sys/filep.h>
430Sstevel@tonic-gate
440Sstevel@tonic-gate
450Sstevel@tonic-gate /*
460Sstevel@tonic-gate * Big theory statement on how ufsboot makes use of the log
470Sstevel@tonic-gate * in case the filesystem wasn't shut down cleanly.
480Sstevel@tonic-gate *
490Sstevel@tonic-gate * The structure of the ufs on-disk log looks like this:
500Sstevel@tonic-gate *
510Sstevel@tonic-gate * +-----------------+
520Sstevel@tonic-gate * | SUPERBLOCK |
530Sstevel@tonic-gate * | ... |
540Sstevel@tonic-gate * | fs_logbno +--> +-----------------------+
550Sstevel@tonic-gate * | ... | | EXTENT BLOCK |
560Sstevel@tonic-gate * +-----------------+ | ... |
570Sstevel@tonic-gate * | nextents |
580Sstevel@tonic-gate * +----------------------+ extents[0].pbno |
590Sstevel@tonic-gate * | | { extents[1].pbno } +------------+
600Sstevel@tonic-gate * | | ... +--> ... |
610Sstevel@tonic-gate * | +-----------------------+ |
620Sstevel@tonic-gate * v |
630Sstevel@tonic-gate * +-----------------------------+ \ |
640Sstevel@tonic-gate * | ON-DISK LOG HEADER | | |
650Sstevel@tonic-gate * | ... | | |
660Sstevel@tonic-gate * | od_head_lof +--+ | |
670Sstevel@tonic-gate * | ... | | | |
680Sstevel@tonic-gate * +-----------------------------+ <|---|- od_bol_lof |
690Sstevel@tonic-gate * | sector (may contain deltas) | | | (logical offset) |
700Sstevel@tonic-gate * | +-------------------------+ | | |
710Sstevel@tonic-gate * | | trailer (some ident#) | | > extents[0].nbno |
720Sstevel@tonic-gate * +---+-------------------------+ | | blocks ("sectors") |
730Sstevel@tonic-gate * . . | | |
740Sstevel@tonic-gate * . . | | |
750Sstevel@tonic-gate * +-----------------------------+<-+ | |
760Sstevel@tonic-gate * | delta1 delta2 delta3 | | |
770Sstevel@tonic-gate * | d +-------------------------+ | |
780Sstevel@tonic-gate * | e | ident#: od_head_ident | | |
790Sstevel@tonic-gate * +---+-------------------------+ / |
800Sstevel@tonic-gate * |
810Sstevel@tonic-gate * +-----------------------------+ <---------------------------+
820Sstevel@tonic-gate * | lta4 delta5 delta6 de |
830Sstevel@tonic-gate * | l +-------------------------+
840Sstevel@tonic-gate * | t | ident#: od_head_ident+1 |
850Sstevel@tonic-gate * +---+-------------------------+
860Sstevel@tonic-gate * . .
870Sstevel@tonic-gate * +-----------------------------+
880Sstevel@tonic-gate * | sector (may contain deltas) |
890Sstevel@tonic-gate * | +------------------+
900Sstevel@tonic-gate * | | trailer (ident#) |
910Sstevel@tonic-gate * +----------+------------------+ <-- od_eol_lof (logical offset)
920Sstevel@tonic-gate *
930Sstevel@tonic-gate * The ufs on-disk log has the following properties:
940Sstevel@tonic-gate *
950Sstevel@tonic-gate * 1. The log is made up from at least one extent. "fs_logbno" in
960Sstevel@tonic-gate * the superblock points to where this is found.
970Sstevel@tonic-gate * 2. Extents describe the logical layout.
980Sstevel@tonic-gate * - Logical offset 0 is the on-disk log header. It's also
990Sstevel@tonic-gate * at the beginning of the first physical block.
1000Sstevel@tonic-gate * - If there's more than one extent, the equation holds:
1010Sstevel@tonic-gate * extent[i+1].lbno == extent[i].lbno + extent[i].nbno
1020Sstevel@tonic-gate * i.e. logical offsets form a contiguous sequence. Yet on disk,
1030Sstevel@tonic-gate * two logically-adjacent offsets may be located in two
1040Sstevel@tonic-gate * physically disjoint extents, so logical offsets need to be
1050Sstevel@tonic-gate * translated into physical disk block addresses for access.
1060Sstevel@tonic-gate * - Various fields in the on-disk log header structure refer
1070Sstevel@tonic-gate * to such logical log offsets.
1080Sstevel@tonic-gate * 3. The actual logical logspace begins after the log header, at
1090Sstevel@tonic-gate * the logical offset indicated by "od_bol_lof". Every 512 bytes
1100Sstevel@tonic-gate * (a "sector" in terms of ufs logging) ends with a sector trailer which
1110Sstevel@tonic-gate * contains a sequence number, the sector ident.
1120Sstevel@tonic-gate * 4. Deltas are packed tight in the remaining space, i.e. a delta
1130Sstevel@tonic-gate * may be part of more than one sector. Reads from the logspace
1140Sstevel@tonic-gate * must be split at sector boundaries, since the trailer is never
1150Sstevel@tonic-gate * part of a delta. Delta sizes vary.
1160Sstevel@tonic-gate * 5. The field "od_head_lof" points to the start of the dirty part
1170Sstevel@tonic-gate * of the log, i.e. to the first delta header. Likewise, "od_head_ident"
1180Sstevel@tonic-gate * is the sequence number where the valid part of the log starts; if
1190Sstevel@tonic-gate * the sector pointed to by "od_head_lof" has a sector ident different
1200Sstevel@tonic-gate * from "od_head_ident", the log is empty.
1210Sstevel@tonic-gate * 6. The valid part of the log extends for as many sectors as their ident
1220Sstevel@tonic-gate * numbers form a contiguous sequence. When reaching the logical end of
1230Sstevel@tonic-gate * the log, "od_eol_lof", logical offsets wrap around to "od_bol_lof",
1240Sstevel@tonic-gate * i.e. the log forms a circular buffer.
1250Sstevel@tonic-gate *
1260Sstevel@tonic-gate * For the strategy how to handle accessing the log, item 4. is the
1270Sstevel@tonic-gate * most important one - its consequence is that the log can only be
1280Sstevel@tonic-gate * read in one direction - forward, starting at the head.
1290Sstevel@tonic-gate *
1300Sstevel@tonic-gate * The task of identifying whether a given metadata block is
1310Sstevel@tonic-gate * actually in the log therefore requires reading the entire
1320Sstevel@tonic-gate * log. Doing so is memory-efficient but kills speed if re-done
1330Sstevel@tonic-gate * at every metadata read (64MB log size vs. 512 byte metadata
1340Sstevel@tonic-gate * block size: 128 times as much I/O, possibly only to find out
1350Sstevel@tonic-gate * that this block was not in the log ...).
1360Sstevel@tonic-gate *
1370Sstevel@tonic-gate * First thought to speed this up is to let ufsboot roll the log.
1380Sstevel@tonic-gate * But this is not possible because:
1390Sstevel@tonic-gate * - ufsboot currently does not implement any write functionality,
1400Sstevel@tonic-gate * the boot-time ufs implementation is read-only.
1410Sstevel@tonic-gate * - firmware write interfaces may or may not be available, in any
1420Sstevel@tonic-gate * case, they're rarely used and untested for such a purpose.
1430Sstevel@tonic-gate * - that would duplicate a lot of code, since at the moment only
1440Sstevel@tonic-gate * kernel ufs logging implements log rolling.
1450Sstevel@tonic-gate * - the boot environment cannot be considered high-performance;
1460Sstevel@tonic-gate * rolling the log there would be slow.
1470Sstevel@tonic-gate * - boot device and root device could well be different, creating
1480Sstevel@tonic-gate * inconsistencies e.g. with a mirrored root if the log is rolled.
1490Sstevel@tonic-gate *
1500Sstevel@tonic-gate * Therefore, caching the log structural information (boot-relevant
1510Sstevel@tonic-gate * deltas and their logical log offset) is required for fast access
1520Sstevel@tonic-gate * to the data in the log. This code builds a logmap for that purpose.
1530Sstevel@tonic-gate *
1540Sstevel@tonic-gate * As a simple optimization, if we find the log is empty, we will not
1550Sstevel@tonic-gate * use it - log reader support for ufsboot has no noticeable overhead
1560Sstevel@tonic-gate * for clean logs, or for root filesystems that aren't logging.
1570Sstevel@tonic-gate */
1580Sstevel@tonic-gate
/*
 * Logmap hash geometry: LB_HASHSIZE (2^LB_HASHSHIFT) buckets. The bucket
 * for a disk address is found by dropping its low LB_HASHSHIFT bits and
 * masking the result down to the table size.
 */
#define	LB_HASHSHIFT		13
#define	LB_HASHSIZE		(1 << LB_HASHSHIFT)
#define	LB_HASHFUNC(mof)	(((mof) >> LB_HASHSHIFT) & (LB_HASHSIZE - 1))

/*
 * Size bounds, in bytes, of the buffer that backs the logmap.
 */
#define	LOGBUF_MAXSIZE		(8 * 1024 * 1024)
#define	LOGBUF_MINSIZE		(256 * 1024)

/*
 * Log states; lufs_boot_init() dispatches on these after examining the log.
 */
#define	LOG_IS_EMPTY		0
#define	LOG_IS_OK		1
#define	LOG_IS_ERRORED		2
1690Sstevel@tonic-gate
1700Sstevel@tonic-gate /*
1710Sstevel@tonic-gate * We build a hashed logmap of the deltas while scanning the log.
1720Sstevel@tonic-gate * sizeof(lb_me_t) is 40 on 64bit, 32 on 32bit; the max sized
1730Sstevel@tonic-gate * resalloc'ed buffer can accommodate around ~500k of those;
1740Sstevel@tonic-gate * this is approximately the maximum amount of deltas we'll
1750Sstevel@tonic-gate * see if a 64MB ufs log is completely filled. We'll make no
1760Sstevel@tonic-gate * attempt to free and reallocate the resalloc'ed buffer if
1770Sstevel@tonic-gate * we overflow, as conservative sizing should make that an
1780Sstevel@tonic-gate * impossibility. A future enhancement may allocate memory
1790Sstevel@tonic-gate * here as needed - once the boot time memory allocator
1800Sstevel@tonic-gate * supports that.
1810Sstevel@tonic-gate */
1820Sstevel@tonic-gate typedef struct lb_mapentry {
1830Sstevel@tonic-gate struct lb_mapentry *l_next; /* hash chaining */
1840Sstevel@tonic-gate struct lb_mapentry *l_prev; /* hash chaining */
1850Sstevel@tonic-gate int64_t l_mof; /* disk addr this delta is against */
1860Sstevel@tonic-gate int16_t l_nb; /* size of delta */
1870Sstevel@tonic-gate int16_t l_flags;
1880Sstevel@tonic-gate int32_t l_lof; /* log offset for delta header */
1890Sstevel@tonic-gate int32_t l_tid; /* transaction this delta is part of */
1900Sstevel@tonic-gate delta_t l_typ; /* see <sys/fs/ufs_trans.h> */
1910Sstevel@tonic-gate } lb_me_t;
1920Sstevel@tonic-gate
1930Sstevel@tonic-gate #define LB_ISCANCELLED 1
1940Sstevel@tonic-gate
/*
 * inslist(lh, l): link entry "l" into the circular, doubly-linked chain
 * whose head pointer is "*lh". The new entry becomes the predecessor of
 * the current head (i.e. the tail of the chain); an empty chain is started
 * with "l" as its only, self-linked member.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves like a
 * single statement, e.g. as the unbraced body of an if/else. Both arguments
 * are evaluated more than once and must be free of side effects.
 */
#define	inslist(lh, l)	do {						\
	if ((*(lh))) {							\
		(*(lh))->l_prev->l_next = (l);				\
		(l)->l_next = (*(lh));					\
		(l)->l_prev = (*(lh))->l_prev;				\
		(*(lh))->l_prev = (l);					\
	} else {							\
		(l)->l_next = (l);					\
		(l)->l_prev = (l);					\
		(*(lh)) = (l);						\
	}								\
} while (0)
2050Sstevel@tonic-gate
/*
 * remlist(lh, l): unlink entry "l" from the circular, doubly-linked chain
 * whose head pointer is "*lh".
 *
 * If "l" is self-linked it is taken to be the only member and the chain
 * becomes empty; a debug message is printed if the head pointer or l_prev
 * disagree with that. Otherwise the head is advanced when it points at
 * "l" and the neighbours are joined. "l" itself is left untouched.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves like a
 * single statement, e.g. as the unbraced body of an if/else. Both arguments
 * are evaluated more than once and must be free of side effects.
 */
#define	remlist(lh, l)	do {						\
	if ((l)->l_next == (l)) {					\
		if (*(lh) != (l) || (l)->l_prev != (l))			\
			dprintf("Logmap hash inconsistency.\n");	\
		*(lh) = (lb_me_t *)NULL;				\
	} else {							\
		if (*(lh) == (l))					\
			*(lh) = (l)->l_next;				\
		(l)->l_prev->l_next = (l)->l_next;			\
		(l)->l_next->l_prev = (l)->l_prev;			\
	}								\
} while (0)
2170Sstevel@tonic-gate
2180Sstevel@tonic-gate #define lufs_alloc_me() \
2190Sstevel@tonic-gate (lb_me_t *)lufs_alloc_from_logbuf(sizeof (lb_me_t))
2200Sstevel@tonic-gate
2210Sstevel@tonic-gate extern int boothowto;
2220Sstevel@tonic-gate static int ufs_is_lufs = 0;
2230Sstevel@tonic-gate static fileid_t *logfp = (fileid_t *)NULL;
2240Sstevel@tonic-gate static extent_block_t *eb = (extent_block_t *)NULL;
2250Sstevel@tonic-gate static ml_odunit_t odi;
2260Sstevel@tonic-gate
2270Sstevel@tonic-gate static char logbuffer_min[LOGBUF_MINSIZE];
2280Sstevel@tonic-gate static caddr_t logbuffer = (caddr_t)NULL;
2290Sstevel@tonic-gate static caddr_t elogbuffer = (caddr_t)NULL;
2300Sstevel@tonic-gate static caddr_t logbuf_curptr;
2310Sstevel@tonic-gate static lb_me_t **loghash = (lb_me_t **)NULL;
2320Sstevel@tonic-gate static lb_me_t *lfreelist;
2330Sstevel@tonic-gate
2340Sstevel@tonic-gate static uint32_t curtid;
2350Sstevel@tonic-gate
2360Sstevel@tonic-gate
2370Sstevel@tonic-gate int lufs_support = 1;
2380Sstevel@tonic-gate
2390Sstevel@tonic-gate void lufs_boot_init(fileid_t *);
2400Sstevel@tonic-gate void lufs_closeall(void);
2410Sstevel@tonic-gate void lufs_merge_deltas(fileid_t *);
2420Sstevel@tonic-gate
2430Sstevel@tonic-gate static int lufs_logscan(void);
2440Sstevel@tonic-gate
2450Sstevel@tonic-gate extern int diskread(fileid_t *filep);
2460Sstevel@tonic-gate extern caddr_t resalloc(enum RESOURCES, size_t, caddr_t, int);
2470Sstevel@tonic-gate
248*6747Sga159272 #if defined(__sparcv9)
2490Sstevel@tonic-gate #define LOGBUF_BASEADDR ((caddr_t)(SYSBASE - LOGBUF_MAXSIZE))
2500Sstevel@tonic-gate #endif
2510Sstevel@tonic-gate
2520Sstevel@tonic-gate static int
lufs_alloc_logbuf(void)2530Sstevel@tonic-gate lufs_alloc_logbuf(void)
2540Sstevel@tonic-gate {
2550Sstevel@tonic-gate /*
2560Sstevel@tonic-gate * Allocate memory for caching the log. Since the logbuffer can
2570Sstevel@tonic-gate * potentially exceed the boot scratch memory limit, we use resalloc
2580Sstevel@tonic-gate * directly, passing the allocation to the low-level boot-time
2590Sstevel@tonic-gate * backend allocator. The chosen VA range is the top end of
2600Sstevel@tonic-gate * the kernel's segmap segment, so we're not interfering
2610Sstevel@tonic-gate * with the kernel because segmap is created at a time when
2620Sstevel@tonic-gate * the 2nd-stage boot has already been unloaded and this VA
2630Sstevel@tonic-gate * range was given back.
2640Sstevel@tonic-gate *
2650Sstevel@tonic-gate * On sparc platforms, the kernel cannot recover the memory
2660Sstevel@tonic-gate * obtained from resalloc because the page structs are allocated
2670Sstevel@tonic-gate * before the call to BOP_QUIESCE. To avoid leaking this
2680Sstevel@tonic-gate * memory, the logbuffer is allocated from a small bss array
2690Sstevel@tonic-gate * that should hold the logmap except in the most extreme cases.
2700Sstevel@tonic-gate * If the bss array is too small, the logbuffer is extended
2710Sstevel@tonic-gate * from resalloc 1 page at a time.
2720Sstevel@tonic-gate */
2730Sstevel@tonic-gate
2740Sstevel@tonic-gate logbuffer = logbuffer_min;
2750Sstevel@tonic-gate elogbuffer = logbuffer+LOGBUF_MINSIZE;
2760Sstevel@tonic-gate logbuf_curptr = logbuffer;
2770Sstevel@tonic-gate lfreelist = (lb_me_t *)NULL;
2780Sstevel@tonic-gate
2790Sstevel@tonic-gate if (logbuffer == (caddr_t)NULL)
2800Sstevel@tonic-gate return (0);
2810Sstevel@tonic-gate
2820Sstevel@tonic-gate dprintf("Buffer for boot loader logging support: 0x%p, size 0x%x\n",
283*6747Sga159272 logbuffer, elogbuffer-logbuffer);
2840Sstevel@tonic-gate
2850Sstevel@tonic-gate return (1);
2860Sstevel@tonic-gate }
2870Sstevel@tonic-gate
2880Sstevel@tonic-gate static void
lufs_free_logbuf()2890Sstevel@tonic-gate lufs_free_logbuf()
2900Sstevel@tonic-gate {
2910Sstevel@tonic-gate /*
2920Sstevel@tonic-gate * Solaris/x86 has no prom_free() routine at this time.
2930Sstevel@tonic-gate * Reclaiming the VA range below KERNEL_TEXT on Solaris/x86
2940Sstevel@tonic-gate * is done by the kernel startup itself, in hat_unload_prom()
2950Sstevel@tonic-gate * after the bootloader has been quiesced.
2960Sstevel@tonic-gate *
2970Sstevel@tonic-gate * Solaris on sparc has a prom_free() routine that will update
2980Sstevel@tonic-gate * the memlist properties to reflect the freeing of the
2990Sstevel@tonic-gate * logbuffer. However, the sparc kernel cannot recover
3000Sstevel@tonic-gate * the memory freed after the call to BOP_QUIESCE as the
3010Sstevel@tonic-gate * page struct have already been allocated. We call
3020Sstevel@tonic-gate * prom_free anyway so that the kernel can reclaim this
3030Sstevel@tonic-gate * memory in the future.
3040Sstevel@tonic-gate */
3050Sstevel@tonic-gate if (logbuffer == LOGBUF_BASEADDR)
3060Sstevel@tonic-gate prom_free(logbuffer, elogbuffer-logbuffer);
3070Sstevel@tonic-gate logbuffer = (caddr_t)NULL;
3080Sstevel@tonic-gate }
3090Sstevel@tonic-gate
3100Sstevel@tonic-gate static caddr_t
lufs_alloc_from_logbuf(size_t sz)3110Sstevel@tonic-gate lufs_alloc_from_logbuf(size_t sz)
3120Sstevel@tonic-gate {
3130Sstevel@tonic-gate caddr_t tmpaddr;
3140Sstevel@tonic-gate lb_me_t *l;
3150Sstevel@tonic-gate
3160Sstevel@tonic-gate /*
3170Sstevel@tonic-gate * Satisfy lb_me_t allocations from the freelist
3180Sstevel@tonic-gate * first if possible.
3190Sstevel@tonic-gate */
3200Sstevel@tonic-gate if ((sz == sizeof (lb_me_t)) && lfreelist) {
3210Sstevel@tonic-gate l = lfreelist;
3220Sstevel@tonic-gate lfreelist = lfreelist->l_next;
3230Sstevel@tonic-gate return ((caddr_t)l);
3240Sstevel@tonic-gate }
3250Sstevel@tonic-gate if (elogbuffer < logbuf_curptr + sz) {
3260Sstevel@tonic-gate caddr_t np;
3270Sstevel@tonic-gate size_t nsz;
3280Sstevel@tonic-gate
3290Sstevel@tonic-gate /*
3300Sstevel@tonic-gate * Out of space in current chunk - try to add another.
3310Sstevel@tonic-gate */
3320Sstevel@tonic-gate if (logbuffer == logbuffer_min) {
3330Sstevel@tonic-gate np = LOGBUF_BASEADDR;
3340Sstevel@tonic-gate } else {
3350Sstevel@tonic-gate np = elogbuffer;
3360Sstevel@tonic-gate }
3370Sstevel@tonic-gate nsz = roundup(sz, PAGESIZE);
3380Sstevel@tonic-gate if (np + nsz > LOGBUF_BASEADDR + LOGBUF_MAXSIZE) {
3390Sstevel@tonic-gate return ((caddr_t)NULL);
3400Sstevel@tonic-gate }
3410Sstevel@tonic-gate
3420Sstevel@tonic-gate np = resalloc(RES_CHILDVIRT, nsz, np, 0UL);
3430Sstevel@tonic-gate if (np == (caddr_t)NULL) {
3440Sstevel@tonic-gate return ((caddr_t)NULL);
3450Sstevel@tonic-gate }
3460Sstevel@tonic-gate if (logbuffer == logbuffer_min)
3470Sstevel@tonic-gate logbuffer = LOGBUF_BASEADDR;
3480Sstevel@tonic-gate logbuf_curptr = np;
3490Sstevel@tonic-gate elogbuffer = logbuf_curptr + nsz;
3500Sstevel@tonic-gate }
3510Sstevel@tonic-gate
3520Sstevel@tonic-gate tmpaddr = logbuf_curptr;
3530Sstevel@tonic-gate logbuf_curptr += sz;
3540Sstevel@tonic-gate bzero(tmpaddr, sz);
3550Sstevel@tonic-gate return (tmpaddr);
3560Sstevel@tonic-gate }
3570Sstevel@tonic-gate
3580Sstevel@tonic-gate static int32_t
lufs_read_log(int32_t addr,caddr_t va,int nb)3590Sstevel@tonic-gate lufs_read_log(int32_t addr, caddr_t va, int nb)
3600Sstevel@tonic-gate {
3610Sstevel@tonic-gate int i, fastpath = 0;
3620Sstevel@tonic-gate daddr_t pblk, lblk;
3630Sstevel@tonic-gate sect_trailer_t *st;
3640Sstevel@tonic-gate uint32_t ident;
3650Sstevel@tonic-gate
3660Sstevel@tonic-gate /*
3670Sstevel@tonic-gate * Fast path for skipping the read if no target buffer
3680Sstevel@tonic-gate * is specified. Don't do this for the initial scan.
3690Sstevel@tonic-gate */
3700Sstevel@tonic-gate if (ufs_is_lufs && (va == (caddr_t)NULL))
3710Sstevel@tonic-gate fastpath = 1;
3720Sstevel@tonic-gate
3730Sstevel@tonic-gate while (nb) {
3740Sstevel@tonic-gate /* log wraparound check */
3750Sstevel@tonic-gate if (addr == odi.od_eol_lof)
3760Sstevel@tonic-gate addr = odi.od_bol_lof;
3770Sstevel@tonic-gate if (fastpath)
3780Sstevel@tonic-gate goto read_done;
3790Sstevel@tonic-gate
3800Sstevel@tonic-gate /*
3810Sstevel@tonic-gate * Translate logically-contiguous log offsets into physical
3820Sstevel@tonic-gate * block numbers. For a log consisting of a single extent:
3830Sstevel@tonic-gate * pbno = btodb(addr) - extents[0].lbno;
3840Sstevel@tonic-gate * Otherwise, search for the extent which contains addr.
3850Sstevel@tonic-gate */
3860Sstevel@tonic-gate pblk = 0;
3870Sstevel@tonic-gate lblk = btodb(addr);
3880Sstevel@tonic-gate for (i = 0; i < eb->nextents; i++) {
3890Sstevel@tonic-gate if (lblk >= eb->extents[i].lbno &&
390*6747Sga159272 lblk < eb->extents[i].lbno +
391*6747Sga159272 eb->extents[i].nbno) {
3920Sstevel@tonic-gate pblk = lblk - eb->extents[i].lbno +
393*6747Sga159272 eb->extents[i].pbno;
3940Sstevel@tonic-gate break;
3950Sstevel@tonic-gate }
3960Sstevel@tonic-gate }
3970Sstevel@tonic-gate
3980Sstevel@tonic-gate if (pblk == 0) {
3990Sstevel@tonic-gate /*
4000Sstevel@tonic-gate * block #0 can never be in a log extent since this
4010Sstevel@tonic-gate * block always contains the primary superblock copy.
4020Sstevel@tonic-gate */
4030Sstevel@tonic-gate dprintf("No log extent found for log offset 0x%llx.\n",
404*6747Sga159272 addr);
4050Sstevel@tonic-gate return (0);
4060Sstevel@tonic-gate }
4070Sstevel@tonic-gate
4080Sstevel@tonic-gate /*
4090Sstevel@tonic-gate * Check whether the block we want is cached from the last
4100Sstevel@tonic-gate * read. If not, read it in now.
4110Sstevel@tonic-gate */
4120Sstevel@tonic-gate if (logfp->fi_blocknum != pblk) {
4130Sstevel@tonic-gate logfp->fi_blocknum = pblk;
4140Sstevel@tonic-gate logfp->fi_memp = logfp->fi_buf;
4150Sstevel@tonic-gate logfp->fi_count = DEV_BSIZE;
4160Sstevel@tonic-gate logfp->fi_offset = 0;
4170Sstevel@tonic-gate if (diskread(logfp)) {
4180Sstevel@tonic-gate dprintf("I/O error reading the ufs log" \
419*6747Sga159272 " at block 0x%x.\n",
420*6747Sga159272 logfp->fi_blocknum);
4210Sstevel@tonic-gate return (0);
4220Sstevel@tonic-gate }
4230Sstevel@tonic-gate /*
4240Sstevel@tonic-gate * Log structure verification. The block which we just
4250Sstevel@tonic-gate * read has an ident number that must match its offset
4260Sstevel@tonic-gate * in blocks from the head of the log. Since the log
4270Sstevel@tonic-gate * can wrap around, we have to check for that to get the
4280Sstevel@tonic-gate * ident right. Out-of-sequence idents can happen after
4290Sstevel@tonic-gate * power failures, panics during a partial transaction,
4300Sstevel@tonic-gate * media errors, ... - in any case, they mark the end of
4310Sstevel@tonic-gate * the valid part of the log.
4320Sstevel@tonic-gate */
4330Sstevel@tonic-gate st = (sect_trailer_t *)(logfp->fi_memp +
434*6747Sga159272 LDL_USABLE_BSIZE);
4350Sstevel@tonic-gate /* od_head_ident is where the sequence starts */
4360Sstevel@tonic-gate ident = odi.od_head_ident;
4370Sstevel@tonic-gate if (lblk >= lbtodb(odi.od_head_lof)) {
4380Sstevel@tonic-gate /* no wraparound */
4390Sstevel@tonic-gate ident += (lblk - lbtodb(odi.od_head_lof));
4400Sstevel@tonic-gate } else {
4410Sstevel@tonic-gate /* log wrapped around the end */
4420Sstevel@tonic-gate ident += (lbtodb(odi.od_eol_lof) -
443*6747Sga159272 lbtodb(odi.od_head_lof));
4440Sstevel@tonic-gate ident += (lblk - lbtodb(odi.od_bol_lof));
4450Sstevel@tonic-gate }
4460Sstevel@tonic-gate
4470Sstevel@tonic-gate if (ident != st->st_ident)
4480Sstevel@tonic-gate return (0);
4490Sstevel@tonic-gate }
4500Sstevel@tonic-gate read_done:
4510Sstevel@tonic-gate /*
4520Sstevel@tonic-gate * Copy the delta contents to the destination buffer if
4530Sstevel@tonic-gate * one was specified. Otherwise, just skip the contents.
4540Sstevel@tonic-gate */
4550Sstevel@tonic-gate i = MIN(NB_LEFT_IN_SECTOR(addr), nb);
4560Sstevel@tonic-gate if (va != NULL) {
4570Sstevel@tonic-gate bcopy(logfp->fi_buf + (addr - ldbtob(lbtodb(addr))),
458*6747Sga159272 va, i);
4590Sstevel@tonic-gate va += i;
4600Sstevel@tonic-gate }
4610Sstevel@tonic-gate nb -= i;
4620Sstevel@tonic-gate addr += i;
4630Sstevel@tonic-gate /*
4640Sstevel@tonic-gate * Skip sector trailer if necessary.
4650Sstevel@tonic-gate */
4660Sstevel@tonic-gate if (NB_LEFT_IN_SECTOR(addr) == 0)
4670Sstevel@tonic-gate addr += sizeof (sect_trailer_t);
4680Sstevel@tonic-gate }
4690Sstevel@tonic-gate return (addr);
4700Sstevel@tonic-gate }
4710Sstevel@tonic-gate
4720Sstevel@tonic-gate void
lufs_boot_init(fileid_t * filep)4730Sstevel@tonic-gate lufs_boot_init(fileid_t *filep)
4740Sstevel@tonic-gate {
4750Sstevel@tonic-gate struct fs *sb = (struct fs *)filep->fi_memp;
4760Sstevel@tonic-gate int err = 0;
4770Sstevel@tonic-gate
4780Sstevel@tonic-gate /*
4790Sstevel@tonic-gate * boot_ufs_mountroot() should have called us with a
4800Sstevel@tonic-gate * filep pointing to the superblock. Verify that this
4810Sstevel@tonic-gate * is so first.
4820Sstevel@tonic-gate * Then check whether this filesystem has a dirty log.
4830Sstevel@tonic-gate * Also return if lufs support was disabled on request.
4840Sstevel@tonic-gate */
4850Sstevel@tonic-gate if (!lufs_support ||
486*6747Sga159272 sb != (struct fs *)&filep->fi_devp->un_fs.di_fs ||
487*6747Sga159272 sb->fs_clean != FSLOG || sb->fs_logbno == NULL) {
4880Sstevel@tonic-gate return;
4890Sstevel@tonic-gate }
4900Sstevel@tonic-gate
4910Sstevel@tonic-gate if (boothowto & RB_VERBOSE)
4920Sstevel@tonic-gate printf("The boot filesystem is logging.\n");
4930Sstevel@tonic-gate
4940Sstevel@tonic-gate /*
4950Sstevel@tonic-gate * The filesystem is logging, there is a log area
4960Sstevel@tonic-gate * allocated for it. Check the log state and determine
4970Sstevel@tonic-gate * whether it'll be possible to use this log.
4980Sstevel@tonic-gate */
4990Sstevel@tonic-gate
5000Sstevel@tonic-gate /*
5010Sstevel@tonic-gate * Allocate a private fileid_t for use when reading
5020Sstevel@tonic-gate * from the log.
5030Sstevel@tonic-gate */
5040Sstevel@tonic-gate eb = (extent_block_t *)bkmem_zalloc(sb->fs_bsize);
5050Sstevel@tonic-gate logfp = (fileid_t *)bkmem_zalloc(sizeof (fileid_t));
5060Sstevel@tonic-gate logfp->fi_memp = logfp->fi_buf;
5070Sstevel@tonic-gate logfp->fi_devp = filep->fi_devp;
5080Sstevel@tonic-gate
5090Sstevel@tonic-gate /*
5100Sstevel@tonic-gate * Read the extent block and verify that what we
5110Sstevel@tonic-gate * find there are actually lufs extents.
5120Sstevel@tonic-gate * Make it simple: the extent block including all
5130Sstevel@tonic-gate * extents cannot be larger than a filesystem block.
5140Sstevel@tonic-gate * So read a whole filesystem block, to make sure
5150Sstevel@tonic-gate * we have read all extents in the same operation.
5160Sstevel@tonic-gate */
5170Sstevel@tonic-gate logfp->fi_blocknum = sb->fs_logbno;
5180Sstevel@tonic-gate logfp->fi_count = sb->fs_bsize;
5190Sstevel@tonic-gate logfp->fi_memp = (caddr_t)eb;
5200Sstevel@tonic-gate logfp->fi_offset = 0;
5210Sstevel@tonic-gate if (diskread(logfp) || eb->type != LUFS_EXTENTS) {
5220Sstevel@tonic-gate dprintf("Failed to read log extent block.\n");
5230Sstevel@tonic-gate err = LOG_IS_ERRORED;
5240Sstevel@tonic-gate goto out;
5250Sstevel@tonic-gate }
5260Sstevel@tonic-gate
5270Sstevel@tonic-gate /*
5280Sstevel@tonic-gate * Read the on disk log header. If that fails,
5290Sstevel@tonic-gate * try the backup copy on the adjacent block.
5300Sstevel@tonic-gate */
5310Sstevel@tonic-gate logfp->fi_blocknum = eb->extents[0].pbno;
5320Sstevel@tonic-gate logfp->fi_count = sizeof (ml_odunit_t);
5330Sstevel@tonic-gate logfp->fi_memp = (caddr_t)&odi;
5340Sstevel@tonic-gate logfp->fi_offset = 0;
5350Sstevel@tonic-gate if (diskread(logfp)) {
5360Sstevel@tonic-gate logfp->fi_blocknum = eb->extents[0].pbno + 1;
5370Sstevel@tonic-gate logfp->fi_count = sizeof (ml_odunit_t);
5380Sstevel@tonic-gate logfp->fi_memp = (caddr_t)&odi;
5390Sstevel@tonic-gate logfp->fi_offset = 0;
5400Sstevel@tonic-gate if (diskread(logfp)) {
5410Sstevel@tonic-gate dprintf("Failed to read on-disk log header.\n");
5420Sstevel@tonic-gate err = LOG_IS_ERRORED;
5430Sstevel@tonic-gate goto out;
5440Sstevel@tonic-gate }
5450Sstevel@tonic-gate }
5460Sstevel@tonic-gate
5470Sstevel@tonic-gate /*
5480Sstevel@tonic-gate * Verify that we understand this log, and
5490Sstevel@tonic-gate * that the log isn't bad or empty.
5500Sstevel@tonic-gate */
5510Sstevel@tonic-gate if (odi.od_version != LUFS_VERSION_LATEST) {
5520Sstevel@tonic-gate dprintf("On-disk log format v%d != supported format v%d.\n",
553*6747Sga159272 odi.od_version, LUFS_VERSION_LATEST);
5540Sstevel@tonic-gate err = LOG_IS_ERRORED;
5550Sstevel@tonic-gate } else if (odi.od_badlog) {
5560Sstevel@tonic-gate dprintf("On-disk log is marked bad.\n");
5570Sstevel@tonic-gate err = LOG_IS_ERRORED;
5580Sstevel@tonic-gate } else if (odi.od_chksum != odi.od_head_ident + odi.od_tail_ident) {
5590Sstevel@tonic-gate dprintf("On-disk log checksum %d != ident sum %d.\n",
560*6747Sga159272 odi.od_chksum, odi.od_head_ident + odi.od_tail_ident);
5610Sstevel@tonic-gate err = LOG_IS_ERRORED;
5620Sstevel@tonic-gate } else {
5630Sstevel@tonic-gate /*
5640Sstevel@tonic-gate * All consistency checks ok. Scan the log, build the
5650Sstevel@tonic-gate * log hash. If this succeeds we'll be using the log
5660Sstevel@tonic-gate * when reading from this filesystem.
5670Sstevel@tonic-gate */
5680Sstevel@tonic-gate err = lufs_logscan();
5690Sstevel@tonic-gate }
5700Sstevel@tonic-gate out:
5710Sstevel@tonic-gate ufs_is_lufs = 1;
5720Sstevel@tonic-gate switch (err) {
5730Sstevel@tonic-gate case LOG_IS_EMPTY:
5740Sstevel@tonic-gate if (boothowto & RB_VERBOSE)
5750Sstevel@tonic-gate printf("The ufs log is empty and will not be used.\n");
5760Sstevel@tonic-gate lufs_closeall();
5770Sstevel@tonic-gate break;
5780Sstevel@tonic-gate case LOG_IS_OK:
5790Sstevel@tonic-gate if (boothowto & RB_VERBOSE)
5800Sstevel@tonic-gate printf("Using the ufs log.\n");
5810Sstevel@tonic-gate break;
5820Sstevel@tonic-gate case LOG_IS_ERRORED:
5830Sstevel@tonic-gate if (boothowto & RB_VERBOSE)
5840Sstevel@tonic-gate printf("Couldn't build log hash. Can't use ufs log.\n");
5850Sstevel@tonic-gate lufs_closeall();
5860Sstevel@tonic-gate break;
5870Sstevel@tonic-gate default:
5880Sstevel@tonic-gate dprintf("Invalid error %d while scanning the ufs log.\n", err);
5890Sstevel@tonic-gate break;
5900Sstevel@tonic-gate }
5910Sstevel@tonic-gate }
5920Sstevel@tonic-gate
5930Sstevel@tonic-gate static int
lufs_logscan_read(int32_t * addr,struct delta * d)5940Sstevel@tonic-gate lufs_logscan_read(int32_t *addr, struct delta *d)
5950Sstevel@tonic-gate {
5960Sstevel@tonic-gate *addr = lufs_read_log(*addr, (caddr_t)d, sizeof (struct delta));
5970Sstevel@tonic-gate
5980Sstevel@tonic-gate if (*addr == 0 ||
5990Sstevel@tonic-gate d->d_typ < DT_NONE || d->d_typ > DT_MAX ||
6000Sstevel@tonic-gate d->d_nb >= odi.od_logsize)
6010Sstevel@tonic-gate return (0);
6020Sstevel@tonic-gate
6030Sstevel@tonic-gate return (1);
6040Sstevel@tonic-gate }
6050Sstevel@tonic-gate
6060Sstevel@tonic-gate static int
lufs_logscan_skip(int32_t * addr,struct delta * d)6070Sstevel@tonic-gate lufs_logscan_skip(int32_t *addr, struct delta *d)
6080Sstevel@tonic-gate {
6090Sstevel@tonic-gate switch (d->d_typ) {
6100Sstevel@tonic-gate case DT_COMMIT:
6110Sstevel@tonic-gate /*
6120Sstevel@tonic-gate * A DT_COMMIT delta has no size as such, but will
6130Sstevel@tonic-gate * always "fill up" the sector that contains it.
6140Sstevel@tonic-gate * The next delta header is found at the beginning
6150Sstevel@tonic-gate * of the next 512-Bytes sector, adjust "addr" to
6160Sstevel@tonic-gate * reflect that.
6170Sstevel@tonic-gate */
6180Sstevel@tonic-gate *addr += ((*addr & (DEV_BSIZE - 1))) ?
619*6747Sga159272 NB_LEFT_IN_SECTOR(*addr) +
620*6747Sga159272 sizeof (sect_trailer_t) : 0;
6210Sstevel@tonic-gate return (1);
6220Sstevel@tonic-gate case DT_CANCEL:
6230Sstevel@tonic-gate case DT_ABZERO:
6240Sstevel@tonic-gate /*
6250Sstevel@tonic-gate * These types of deltas occupy no space in the log
6260Sstevel@tonic-gate */
6270Sstevel@tonic-gate return (1);
6280Sstevel@tonic-gate default:
6290Sstevel@tonic-gate /*
6300Sstevel@tonic-gate * Skip over the delta contents.
6310Sstevel@tonic-gate */
6320Sstevel@tonic-gate *addr = lufs_read_log(*addr, NULL, d->d_nb);
6330Sstevel@tonic-gate }
6340Sstevel@tonic-gate
6350Sstevel@tonic-gate return (*addr != NULL);
6360Sstevel@tonic-gate }
6370Sstevel@tonic-gate
6380Sstevel@tonic-gate static void
lufs_logscan_freecancel(void)6390Sstevel@tonic-gate lufs_logscan_freecancel(void)
6400Sstevel@tonic-gate {
6410Sstevel@tonic-gate lb_me_t **lh, *l, *lnext;
6420Sstevel@tonic-gate int i;
6430Sstevel@tonic-gate
6440Sstevel@tonic-gate /*
6450Sstevel@tonic-gate * Walk the entire log hash and put cancelled entries
6460Sstevel@tonic-gate * onto the freelist. Corner cases:
6470Sstevel@tonic-gate * a) empty hash chain (*lh == NULL)
6480Sstevel@tonic-gate * b) only one entry in chain, and that is cancelled.
6490Sstevel@tonic-gate * If for every cancelled delta another one would've
6500Sstevel@tonic-gate * been added, this situation couldn't occur, but a
6510Sstevel@tonic-gate * DT_CANCEL delta can lead to this as it is never
6520Sstevel@tonic-gate * added.
6530Sstevel@tonic-gate */
6540Sstevel@tonic-gate for (i = 0; i < LB_HASHSIZE; i++) {
6550Sstevel@tonic-gate lh = &loghash[i];
6560Sstevel@tonic-gate l = *lh;
6570Sstevel@tonic-gate do {
6580Sstevel@tonic-gate if (*lh == (lb_me_t *)NULL)
6590Sstevel@tonic-gate break;
6600Sstevel@tonic-gate lnext = l->l_next;
6610Sstevel@tonic-gate if (l->l_flags & LB_ISCANCELLED) {
6620Sstevel@tonic-gate remlist(lh, l);
6630Sstevel@tonic-gate bzero((caddr_t)l, sizeof (lb_me_t));
6640Sstevel@tonic-gate l->l_next = lfreelist;
6650Sstevel@tonic-gate lfreelist = l;
6660Sstevel@tonic-gate /*
6670Sstevel@tonic-gate * Just removed the hash head. In order not
6680Sstevel@tonic-gate * to terminate the while loop, respin chain
6690Sstevel@tonic-gate * walk for this hash chain.
6700Sstevel@tonic-gate */
6710Sstevel@tonic-gate if (lnext == *lh) {
6720Sstevel@tonic-gate i--;
6730Sstevel@tonic-gate break;
6740Sstevel@tonic-gate }
6750Sstevel@tonic-gate }
6760Sstevel@tonic-gate l = lnext;
6770Sstevel@tonic-gate } while (l != *lh);
6780Sstevel@tonic-gate }
6790Sstevel@tonic-gate }
6800Sstevel@tonic-gate
6810Sstevel@tonic-gate static int
lufs_logscan_addmap(int32_t * addr,struct delta * d)6820Sstevel@tonic-gate lufs_logscan_addmap(int32_t *addr, struct delta *d)
6830Sstevel@tonic-gate {
6840Sstevel@tonic-gate lb_me_t **lh, *l;
6850Sstevel@tonic-gate
6860Sstevel@tonic-gate switch (d->d_typ) {
6870Sstevel@tonic-gate case DT_COMMIT:
6880Sstevel@tonic-gate /*
6890Sstevel@tonic-gate * Handling DT_COMMIT deltas is special. We need to:
6900Sstevel@tonic-gate * 1. increase the transaction ID
6910Sstevel@tonic-gate * 2. remove cancelled entries.
6920Sstevel@tonic-gate */
6930Sstevel@tonic-gate lufs_logscan_freecancel();
6940Sstevel@tonic-gate curtid++;
6950Sstevel@tonic-gate break;
6960Sstevel@tonic-gate case DT_INODE:
6970Sstevel@tonic-gate /*
6980Sstevel@tonic-gate * Deltas against parts of on-disk inodes are
6990Sstevel@tonic-gate * assumed to be timestamps. Ignore those.
7000Sstevel@tonic-gate */
7010Sstevel@tonic-gate if (d->d_nb != sizeof (struct dinode))
7020Sstevel@tonic-gate break;
7030Sstevel@tonic-gate /* FALLTHROUGH */
7040Sstevel@tonic-gate case DT_CANCEL:
7050Sstevel@tonic-gate case DT_ABZERO:
7060Sstevel@tonic-gate case DT_AB:
7070Sstevel@tonic-gate case DT_DIR:
7080Sstevel@tonic-gate case DT_FBI:
7090Sstevel@tonic-gate /*
7100Sstevel@tonic-gate * These types of deltas contain and/or modify structural
7110Sstevel@tonic-gate * information that is needed for booting the system:
7120Sstevel@tonic-gate * - where to find a file (DT_DIR, DT_FBI)
7130Sstevel@tonic-gate * - the file itself (DT_INODE)
7140Sstevel@tonic-gate * - data blocks associated with a file (DT_AB, DT_ABZERO)
7150Sstevel@tonic-gate *
7160Sstevel@tonic-gate * Building the hash chains becomes complicated because there
7170Sstevel@tonic-gate * may exist an older (== previously added) entry that overlaps
7180Sstevel@tonic-gate * with the one we want to add.
7190Sstevel@tonic-gate * Four cases must be distinguished:
7200Sstevel@tonic-gate * 1. The new delta is an exact match for an existing one,
7210Sstevel@tonic-gate * or is a superset of an existing one, and both
7220Sstevel@tonic-gate * belong to the same transaction.
7230Sstevel@tonic-gate * The new delta completely supersedes the old one, so
7240Sstevel@tonic-gate * remove that and reuse the structure for the new.
7250Sstevel@tonic-gate * Then add the new delta to the head of the hashchain.
7260Sstevel@tonic-gate * 2. The new delta is an exact match for an existing one,
7270Sstevel@tonic-gate * or is a superset of an existing one, but the two
7280Sstevel@tonic-gate * belong to different transactions (i.e. the old one is
7290Sstevel@tonic-gate * committed).
7300Sstevel@tonic-gate * The existing one is marked to be cancelled when the
7310Sstevel@tonic-gate * next DT_COMMIT record is found, and the hash chain
7320Sstevel@tonic-gate * walk is continued as there may be more existing entries
7330Sstevel@tonic-gate * found which overlap the new delta (happens if that is
7340Sstevel@tonic-gate * a superset of those in the log).
7350Sstevel@tonic-gate * Once no more overlaps are found, goto 4.
7360Sstevel@tonic-gate * 3. An existing entry completely covers the new one.
7370Sstevel@tonic-gate * The new delta is then added directly before this
7380Sstevel@tonic-gate * existing one.
7390Sstevel@tonic-gate * 4. No (more) overlaps with existing entries are found.
7400Sstevel@tonic-gate * Unless this is a DT_CANCEL delta, whose only purpose
7410Sstevel@tonic-gate * is already handled by marking overlapping entries for
7420Sstevel@tonic-gate * cancellation, add the new delta at the hash chain head.
7430Sstevel@tonic-gate *
7440Sstevel@tonic-gate * This strategy makes sure that the hash chains are properly
7450Sstevel@tonic-gate * ordered. lufs_merge_deltas() walks the hash chain backward,
7460Sstevel@tonic-gate * which then ensures that delta merging is done in the same
7470Sstevel@tonic-gate * order as those deltas occur in the log - remember, the
7480Sstevel@tonic-gate * log can only be read in one direction.
7490Sstevel@tonic-gate *
7500Sstevel@tonic-gate */
7510Sstevel@tonic-gate lh = &loghash[LB_HASHFUNC(d->d_mof)];
7520Sstevel@tonic-gate l = *lh;
7530Sstevel@tonic-gate do {
7540Sstevel@tonic-gate if (l == (lb_me_t *)NULL)
7550Sstevel@tonic-gate break;
7560Sstevel@tonic-gate /*
7570Sstevel@tonic-gate * This covers the first two cases above.
7580Sstevel@tonic-gate * If this is a perfect match from the same transaction,
7590Sstevel@tonic-gate * and it isn't already cancelled, we simply replace it
7600Sstevel@tonic-gate * with its newer incarnation.
7610Sstevel@tonic-gate * Otherwise, mark it for cancellation. Handling of
7620Sstevel@tonic-gate * DT_COMMIT is going to remove it, then.
7630Sstevel@tonic-gate */
7640Sstevel@tonic-gate if (WITHIN(l->l_mof, l->l_nb, d->d_mof, d->d_nb)) {
7650Sstevel@tonic-gate if (!(l->l_flags & LB_ISCANCELLED)) {
7660Sstevel@tonic-gate if (l->l_tid == curtid &&
7670Sstevel@tonic-gate d->d_typ != DT_CANCEL) {
7680Sstevel@tonic-gate remlist(lh, l);
7690Sstevel@tonic-gate l->l_mof = d->d_mof;
7700Sstevel@tonic-gate l->l_lof = *addr;
7710Sstevel@tonic-gate l->l_nb = d->d_nb;
7720Sstevel@tonic-gate l->l_typ = d->d_typ;
7730Sstevel@tonic-gate l->l_flags = 0;
7740Sstevel@tonic-gate l->l_tid = curtid;
7750Sstevel@tonic-gate inslist(lh, l);
7760Sstevel@tonic-gate return (1);
7770Sstevel@tonic-gate } else {
7780Sstevel@tonic-gate /*
7790Sstevel@tonic-gate * 2nd case - cancel only.
7800Sstevel@tonic-gate */
7810Sstevel@tonic-gate l->l_flags |= LB_ISCANCELLED;
7820Sstevel@tonic-gate }
7830Sstevel@tonic-gate }
7840Sstevel@tonic-gate } else if (WITHIN(d->d_mof, d->d_nb,
785*6747Sga159272 l->l_mof, l->l_nb)) {
7860Sstevel@tonic-gate /*
7870Sstevel@tonic-gate * This is the third case above.
7880Sstevel@tonic-gate * With deltas DT_ABZERO/DT_AB and DT_FBI/DT_DIR
7890Sstevel@tonic-gate * this may happen - an existing previous delta
7900Sstevel@tonic-gate * is larger than the current one we're planning
7910Sstevel@tonic-gate * to add - DT_ABZERO deltas are supersets of
7920Sstevel@tonic-gate * DT_AB deltas, and likewise DT_FBI/DT_DIR.
7930Sstevel@tonic-gate * In order to do merging correctly, such deltas
7940Sstevel@tonic-gate * put up a barrier for new ones that overlap,
7950Sstevel@tonic-gate * and we have to add the new delta immediately
7960Sstevel@tonic-gate * before (!) the existing one.
7970Sstevel@tonic-gate */
7980Sstevel@tonic-gate lb_me_t *newl;
7990Sstevel@tonic-gate newl = lufs_alloc_me();
8000Sstevel@tonic-gate if (newl == (lb_me_t *)NULL) {
8010Sstevel@tonic-gate /*
8020Sstevel@tonic-gate * No memory. Throw away everything
8030Sstevel@tonic-gate * and try booting without logging
8040Sstevel@tonic-gate * support.
8050Sstevel@tonic-gate */
8060Sstevel@tonic-gate curtid = 0;
8070Sstevel@tonic-gate return (0);
8080Sstevel@tonic-gate }
8090Sstevel@tonic-gate newl->l_mof = d->d_mof;
8100Sstevel@tonic-gate newl->l_lof = *addr; /* "payload" address */
8110Sstevel@tonic-gate newl->l_nb = d->d_nb;
8120Sstevel@tonic-gate newl->l_typ = d->d_typ;
8130Sstevel@tonic-gate newl->l_tid = curtid;
8140Sstevel@tonic-gate newl->l_prev = l->l_prev;
8150Sstevel@tonic-gate newl->l_next = l;
8160Sstevel@tonic-gate l->l_prev->l_next = newl;
8170Sstevel@tonic-gate l->l_prev = newl;
8180Sstevel@tonic-gate if (*lh == l)
8190Sstevel@tonic-gate *lh = newl;
8200Sstevel@tonic-gate return (1);
8210Sstevel@tonic-gate }
8220Sstevel@tonic-gate l = l->l_next;
8230Sstevel@tonic-gate } while (l != *lh);
8240Sstevel@tonic-gate
8250Sstevel@tonic-gate /*
8260Sstevel@tonic-gate * This is case 4., add a new delta at the head of the chain.
8270Sstevel@tonic-gate *
8280Sstevel@tonic-gate * If the new delta is a DT_CANCEL entry, we handled it by
8290Sstevel@tonic-gate * marking everything it covered for cancellation. We can
8300Sstevel@tonic-gate * get by without actually adding the delta itself to the
8310Sstevel@tonic-gate * hash, as it'd need to be removed by the commit code anyway.
8320Sstevel@tonic-gate */
8330Sstevel@tonic-gate if (d->d_typ == DT_CANCEL)
8340Sstevel@tonic-gate break;
8350Sstevel@tonic-gate
8360Sstevel@tonic-gate l = lufs_alloc_me();
8370Sstevel@tonic-gate if (l == (lb_me_t *)NULL) {
8380Sstevel@tonic-gate /*
8390Sstevel@tonic-gate * No memory. Throw away everything
8400Sstevel@tonic-gate * and try booting without logging
8410Sstevel@tonic-gate * support.
8420Sstevel@tonic-gate */
8430Sstevel@tonic-gate curtid = 0;
8440Sstevel@tonic-gate return (0);
8450Sstevel@tonic-gate }
8460Sstevel@tonic-gate l->l_mof = d->d_mof;
8470Sstevel@tonic-gate l->l_lof = *addr; /* this is the "payload" address */
8480Sstevel@tonic-gate l->l_nb = d->d_nb;
8490Sstevel@tonic-gate l->l_typ = d->d_typ;
8500Sstevel@tonic-gate l->l_tid = curtid;
8510Sstevel@tonic-gate inslist(lh, l);
8520Sstevel@tonic-gate break;
8530Sstevel@tonic-gate default:
8540Sstevel@tonic-gate break;
8550Sstevel@tonic-gate }
8560Sstevel@tonic-gate return (1);
8570Sstevel@tonic-gate }
8580Sstevel@tonic-gate
8590Sstevel@tonic-gate static int
lufs_logscan_prescan(void)8600Sstevel@tonic-gate lufs_logscan_prescan(void)
8610Sstevel@tonic-gate {
8620Sstevel@tonic-gate /*
8630Sstevel@tonic-gate * Simulate a full log by setting the tail to be one sector
8640Sstevel@tonic-gate * behind the head. This will make the logscan read all
8650Sstevel@tonic-gate * of the log until an out-of-sequence sector ident is
8660Sstevel@tonic-gate * found.
8670Sstevel@tonic-gate */
8680Sstevel@tonic-gate odi.od_tail_lof = dbtob(btodb(odi.od_head_lof)) - DEV_BSIZE;
8690Sstevel@tonic-gate if (odi.od_tail_lof < odi.od_bol_lof)
8700Sstevel@tonic-gate odi.od_tail_lof = odi.od_eol_lof - DEV_BSIZE;
8710Sstevel@tonic-gate if (odi.od_tail_lof >= odi.od_eol_lof)
8720Sstevel@tonic-gate odi.od_tail_lof = odi.od_bol_lof;
8730Sstevel@tonic-gate
8740Sstevel@tonic-gate /*
8750Sstevel@tonic-gate * While sector trailers maintain TID values, od_head_tid
8760Sstevel@tonic-gate * is not being updated by the kernel ufs logging support
8770Sstevel@tonic-gate * at this time. We therefore count transactions ourselves
8780Sstevel@tonic-gate * starting at zero - as does the kernel ufs logscan code.
8790Sstevel@tonic-gate */
8800Sstevel@tonic-gate curtid = 0;
8810Sstevel@tonic-gate
8820Sstevel@tonic-gate if (!lufs_alloc_logbuf()) {
8830Sstevel@tonic-gate dprintf("Failed to allocate log buffer.\n");
8840Sstevel@tonic-gate return (0);
8850Sstevel@tonic-gate }
8860Sstevel@tonic-gate
8870Sstevel@tonic-gate loghash = (lb_me_t **)lufs_alloc_from_logbuf(
888*6747Sga159272 LB_HASHSIZE * sizeof (lb_me_t *));
8890Sstevel@tonic-gate if (loghash == (lb_me_t **)NULL) {
8900Sstevel@tonic-gate dprintf("Can't allocate loghash[] array.");
8910Sstevel@tonic-gate return (0);
8920Sstevel@tonic-gate }
8930Sstevel@tonic-gate return (1);
8940Sstevel@tonic-gate }
8950Sstevel@tonic-gate
8960Sstevel@tonic-gate /*
8970Sstevel@tonic-gate * This function must remove all uncommitted entries (l->l_tid == curtid)
8980Sstevel@tonic-gate * from the log hash. Doing this, we implicitly delete pending cancellations
8990Sstevel@tonic-gate * as well.
9000Sstevel@tonic-gate * It uses the same hash walk algorithm as lufs_logscan_freecancel(). Only
9010Sstevel@tonic-gate * the check for entries that need to be removed is different.
9020Sstevel@tonic-gate */
9030Sstevel@tonic-gate static void
lufs_logscan_postscan(void)9040Sstevel@tonic-gate lufs_logscan_postscan(void)
9050Sstevel@tonic-gate {
9060Sstevel@tonic-gate lb_me_t **lh, *l, *lnext;
9070Sstevel@tonic-gate int i;
9080Sstevel@tonic-gate
9090Sstevel@tonic-gate for (i = 0; i < LB_HASHSIZE; i++) {
9100Sstevel@tonic-gate lh = &loghash[i];
9110Sstevel@tonic-gate l = *lh;
9120Sstevel@tonic-gate do {
9130Sstevel@tonic-gate if (l == (lb_me_t *)NULL)
9140Sstevel@tonic-gate break;
9150Sstevel@tonic-gate lnext = l->l_next;
9160Sstevel@tonic-gate if (l->l_tid == curtid) {
9170Sstevel@tonic-gate remlist(lh, l);
9180Sstevel@tonic-gate bzero((caddr_t)l, sizeof (lb_me_t));
9190Sstevel@tonic-gate l->l_next = lfreelist;
9200Sstevel@tonic-gate lfreelist = l;
9210Sstevel@tonic-gate if (*lh == (lb_me_t *)NULL)
9220Sstevel@tonic-gate break;
9230Sstevel@tonic-gate /*
9240Sstevel@tonic-gate * Just removed the hash head. In order not
9250Sstevel@tonic-gate * to terminate the while loop, respin chain
9260Sstevel@tonic-gate * walk for this hash chain.
9270Sstevel@tonic-gate */
9280Sstevel@tonic-gate if (lnext == *lh) {
9290Sstevel@tonic-gate i--;
9300Sstevel@tonic-gate break;
9310Sstevel@tonic-gate }
9320Sstevel@tonic-gate } else {
9330Sstevel@tonic-gate l->l_flags &= ~(LB_ISCANCELLED);
9340Sstevel@tonic-gate }
9350Sstevel@tonic-gate l = lnext;
9360Sstevel@tonic-gate } while (l != *lh);
9370Sstevel@tonic-gate }
9380Sstevel@tonic-gate }
9390Sstevel@tonic-gate
9400Sstevel@tonic-gate /*
9410Sstevel@tonic-gate * This function builds the log hash. It performs the same sequence
9420Sstevel@tonic-gate * of actions at logscan as the kernel ufs logging support:
9430Sstevel@tonic-gate * - Prepare the log for scanning by simulating a full log.
9440Sstevel@tonic-gate * - As long as sectors read from the log have contiguous idents, do:
9450Sstevel@tonic-gate * read the delta header
9460Sstevel@tonic-gate * add the delta to the logmap
9470Sstevel@tonic-gate * skip over the contents to the start of the next delta header
9480Sstevel@tonic-gate * - After terminating the scan, remove uncommitted entries.
9490Sstevel@tonic-gate *
9500Sstevel@tonic-gate * This function cannot fail except if mapping the logbuffer area
9510Sstevel@tonic-gate * during lufs_logscan_prescan() fails. If there is a structural
9520Sstevel@tonic-gate * integrity problem and the on-disk log cannot be read, we'll
9530Sstevel@tonic-gate * treat this as the same situation as an uncommitted transaction
9540Sstevel@tonic-gate * at the end of the log (or, corner case of that, an empty log
9550Sstevel@tonic-gate * with no committed transactions in it at all).
9560Sstevel@tonic-gate *
9570Sstevel@tonic-gate */
9580Sstevel@tonic-gate static int
lufs_logscan(void)9590Sstevel@tonic-gate lufs_logscan(void)
9600Sstevel@tonic-gate {
9610Sstevel@tonic-gate int32_t addr;
9620Sstevel@tonic-gate struct delta d;
9630Sstevel@tonic-gate
9640Sstevel@tonic-gate if (!lufs_logscan_prescan())
9650Sstevel@tonic-gate return (LOG_IS_ERRORED);
9660Sstevel@tonic-gate
9670Sstevel@tonic-gate addr = odi.od_head_lof;
9680Sstevel@tonic-gate
9690Sstevel@tonic-gate /*
9700Sstevel@tonic-gate * Note that addr == od_tail_lof means a completely filled
9710Sstevel@tonic-gate * log. This almost never happens, so the common exit path
9720Sstevel@tonic-gate * from this loop is via one of the 'break's.
9730Sstevel@tonic-gate */
9740Sstevel@tonic-gate while (addr != odi.od_tail_lof) {
9750Sstevel@tonic-gate if (!lufs_logscan_read(&addr, &d))
9760Sstevel@tonic-gate break;
9770Sstevel@tonic-gate if (!lufs_logscan_addmap(&addr, &d))
9780Sstevel@tonic-gate return (LOG_IS_ERRORED);
9790Sstevel@tonic-gate if (!lufs_logscan_skip(&addr, &d))
9800Sstevel@tonic-gate break;
9810Sstevel@tonic-gate }
9820Sstevel@tonic-gate
9830Sstevel@tonic-gate lufs_logscan_postscan();
9840Sstevel@tonic-gate /*
9850Sstevel@tonic-gate * Check whether the log contains data, and if so whether
9860Sstevel@tonic-gate * it contains committed data.
9870Sstevel@tonic-gate */
9880Sstevel@tonic-gate if (addr == odi.od_head_lof || curtid == 0) {
9890Sstevel@tonic-gate return (LOG_IS_EMPTY);
9900Sstevel@tonic-gate }
9910Sstevel@tonic-gate return (LOG_IS_OK);
9920Sstevel@tonic-gate }
9930Sstevel@tonic-gate
9940Sstevel@tonic-gate /*
9950Sstevel@tonic-gate * A metadata block was read from disk. Check whether the logmap
9960Sstevel@tonic-gate * has a delta against this byte range, and if so read it in, since
9970Sstevel@tonic-gate * the data in the log is more recent than what was read from other
9980Sstevel@tonic-gate * places on the disk.
9990Sstevel@tonic-gate */
10000Sstevel@tonic-gate void
lufs_merge_deltas(fileid_t * fp)10010Sstevel@tonic-gate lufs_merge_deltas(fileid_t *fp)
10020Sstevel@tonic-gate {
10030Sstevel@tonic-gate int nb;
10040Sstevel@tonic-gate int64_t bof;
10050Sstevel@tonic-gate lb_me_t **lh, *l;
10060Sstevel@tonic-gate int32_t skip;
10070Sstevel@tonic-gate
10080Sstevel@tonic-gate /*
10090Sstevel@tonic-gate * No logmap: Empty log. Nothing to do here.
10100Sstevel@tonic-gate */
10110Sstevel@tonic-gate if (!ufs_is_lufs || logbuffer == (caddr_t)NULL)
10120Sstevel@tonic-gate return;
10130Sstevel@tonic-gate
10140Sstevel@tonic-gate bof = ldbtob(fp->fi_blocknum);
10150Sstevel@tonic-gate nb = fp->fi_count;
10160Sstevel@tonic-gate
10170Sstevel@tonic-gate /*
10180Sstevel@tonic-gate * Search the log hash.
10190Sstevel@tonic-gate * Merge deltas if an overlap is found.
10200Sstevel@tonic-gate */
10210Sstevel@tonic-gate
10220Sstevel@tonic-gate lh = &loghash[LB_HASHFUNC(bof)];
10230Sstevel@tonic-gate
10240Sstevel@tonic-gate if (*lh == (lb_me_t *)NULL)
10250Sstevel@tonic-gate return;
10260Sstevel@tonic-gate
10270Sstevel@tonic-gate l = *lh;
10280Sstevel@tonic-gate
10290Sstevel@tonic-gate do {
10300Sstevel@tonic-gate l = l->l_prev;
10310Sstevel@tonic-gate if (OVERLAP(l->l_mof, l->l_nb, bof, nb)) {
10320Sstevel@tonic-gate /*
10330Sstevel@tonic-gate * Found a delta in the log hash which overlaps
10340Sstevel@tonic-gate * with the current metadata block. Read the
10350Sstevel@tonic-gate * actual delta payload from the on-disk log
10360Sstevel@tonic-gate * directly into the file buffer.
10370Sstevel@tonic-gate */
10380Sstevel@tonic-gate if (l->l_typ != DT_ABZERO) {
10390Sstevel@tonic-gate /*
10400Sstevel@tonic-gate * We have to actually read this part of the
10410Sstevel@tonic-gate * log as it could contain a sector trailer, or
10420Sstevel@tonic-gate * wrap around the end of the log.
10430Sstevel@tonic-gate * If it did, the second offset generation would
10440Sstevel@tonic-gate * be incorrect if we'd started at l->l_lof.
10450Sstevel@tonic-gate */
10460Sstevel@tonic-gate if (!(skip = lufs_read_log(l->l_lof, NULL,
1047*6747Sga159272 MAX(bof - l->l_mof, 0))))
10480Sstevel@tonic-gate dprintf("scan/merge error, pre-skip\n");
10490Sstevel@tonic-gate if (!(skip = lufs_read_log(skip,
1050*6747Sga159272 fp->fi_memp + MAX(l->l_mof - bof, 0),
1051*6747Sga159272 MIN(l->l_mof + l->l_nb, bof + nb) -
1052*6747Sga159272 MAX(l->l_mof, bof))))
10530Sstevel@tonic-gate dprintf("scan/merge error, merge\n");
10540Sstevel@tonic-gate } else {
10550Sstevel@tonic-gate /*
10560Sstevel@tonic-gate * DT_ABZERO requires no disk access, just
10570Sstevel@tonic-gate * clear the byte range which overlaps with
10580Sstevel@tonic-gate * the delta.
10590Sstevel@tonic-gate */
10600Sstevel@tonic-gate bzero(fp->fi_memp + MAX(l->l_mof - bof, 0),
1061*6747Sga159272 MIN(l->l_mof + l->l_nb, bof + nb) -
1062*6747Sga159272 MAX(l->l_mof, bof));
10630Sstevel@tonic-gate }
10640Sstevel@tonic-gate }
10650Sstevel@tonic-gate } while (l->l_prev != (*lh)->l_prev);
10660Sstevel@tonic-gate
10670Sstevel@tonic-gate printf("*\b");
10680Sstevel@tonic-gate }
10690Sstevel@tonic-gate
10700Sstevel@tonic-gate void
lufs_closeall(void)10710Sstevel@tonic-gate lufs_closeall(void)
10720Sstevel@tonic-gate {
10730Sstevel@tonic-gate if (ufs_is_lufs) {
10740Sstevel@tonic-gate bkmem_free((char *)eb, logfp->fi_devp->un_fs.di_fs.fs_bsize);
10750Sstevel@tonic-gate bkmem_free((char *)logfp, sizeof (fileid_t));
10760Sstevel@tonic-gate eb = (extent_block_t *)NULL;
10770Sstevel@tonic-gate bzero((caddr_t)&odi, sizeof (ml_odunit_t));
10780Sstevel@tonic-gate logfp = (fileid_t *)NULL;
10790Sstevel@tonic-gate lufs_free_logbuf();
10800Sstevel@tonic-gate ufs_is_lufs = 0;
10810Sstevel@tonic-gate }
10820Sstevel@tonic-gate }
1083