xref: /onnv-gate/usr/src/uts/common/os/devcache.c (revision 5331:3047ad28a67b)
12797Sjg /*
22797Sjg  * CDDL HEADER START
32797Sjg  *
42797Sjg  * The contents of this file are subject to the terms of the
52797Sjg  * Common Development and Distribution License (the "License").
62797Sjg  * You may not use this file except in compliance with the License.
72797Sjg  *
82797Sjg  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
92797Sjg  * or http://www.opensolaris.org/os/licensing.
102797Sjg  * See the License for the specific language governing permissions
112797Sjg  * and limitations under the License.
122797Sjg  *
132797Sjg  * When distributing Covered Code, include this CDDL HEADER in each
142797Sjg  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
152797Sjg  * If applicable, add the following below this CDDL HEADER, with the
162797Sjg  * fields enclosed by brackets "[]" replaced with your own identifying
172797Sjg  * information: Portions Copyright [yyyy] [name of copyright owner]
182797Sjg  *
192797Sjg  * CDDL HEADER END
202797Sjg  */
212797Sjg /*
224845Svikram  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
232797Sjg  * Use is subject to license terms.
242797Sjg  */
252797Sjg 
262797Sjg #pragma ident	"%Z%%M%	%I%	%E% SMI"
272797Sjg 
282797Sjg #include <sys/note.h>
292797Sjg #include <sys/t_lock.h>
302797Sjg #include <sys/cmn_err.h>
312797Sjg #include <sys/instance.h>
322797Sjg #include <sys/conf.h>
332797Sjg #include <sys/stat.h>
342797Sjg #include <sys/ddi.h>
352797Sjg #include <sys/hwconf.h>
362797Sjg #include <sys/sunddi.h>
372797Sjg #include <sys/sunndi.h>
382797Sjg #include <sys/ddi_impldefs.h>
392797Sjg #include <sys/ndi_impldefs.h>
402797Sjg #include <sys/modctl.h>
412797Sjg #include <sys/dacf.h>
422797Sjg #include <sys/promif.h>
432797Sjg #include <sys/cpuvar.h>
442797Sjg #include <sys/pathname.h>
452797Sjg #include <sys/kobj.h>
462797Sjg #include <sys/devcache.h>
472797Sjg #include <sys/devcache_impl.h>
482797Sjg #include <sys/sysmacros.h>
492797Sjg #include <sys/varargs.h>
502797Sjg #include <sys/callb.h>
512797Sjg 
522797Sjg /*
532797Sjg  * This facility provides interfaces to clients to register,
542797Sjg  * read and update cache data in persisted backing store files,
552797Sjg  * usually in /etc/devices.  The data persisted through this
562797Sjg  * mechanism should be stateless data, functioning in the sense
572797Sjg  * of a cache.  Writes are performed by a background daemon
582797Sjg  * thread, permitting a client to schedule an update without
592797Sjg  * blocking, then continue updating the data state in
602797Sjg  * parallel.  The data is only locked by the daemon thread
612797Sjg  * to pack the data in preparation for the write.
622797Sjg  *
632797Sjg  * Data persisted through this mechanism should be capable
642797Sjg  * of being regenerated through normal system operation,
652797Sjg  * for example attaching all disk devices would cause all
662797Sjg  * devids to be registered for those devices.  By caching
672797Sjg  * a devid-device tuple, the system can operate in a
682797Sjg  * more optimal way, directly attaching the device mapped
692797Sjg  * to a devid, rather than burdensomely driving attach of
702797Sjg  * the entire device tree to discover a single device.
712797Sjg  *
722797Sjg  * Note that a client should only need to include
732797Sjg  * <sys/devcache.h> for the supported interfaces.
742797Sjg  *
752797Sjg  * The data per client is entirely within the control of
762797Sjg  * the client.  When reading, data unpacked from the backing
772797Sjg  * store should be inserted in the list.  The pointer to
78*5331Samw  * the list can be retrieved via nvf_list().  When writing,
792797Sjg  * the data on the list is to be packed and returned to the
802797Sjg  * nvpdaemon as an nvlist.
812797Sjg  *
822797Sjg  * Obvious restrictions are imposed by the limits of the
832797Sjg  * nvlist format.  The data cannot be read or written
842797Sjg  * piecemeal, and large amounts of data aren't recommended.
852797Sjg  * However, nvlists do allow that data be named and typed
862797Sjg  * and can be size-of-int invariant, and the cached data
872797Sjg  * can be versioned conveniently.
882797Sjg  *
892797Sjg  * The registration involves two steps: a handle is
902797Sjg  * allocated by calling the registration function.
912797Sjg  * This sets up the data referenced by the handle and
922797Sjg  * initializes the lock.  Following registration, the
932797Sjg  * client must initialize the data list.  The list
942797Sjg  * interfaces require that the list element with offset
952797Sjg  * to the node link be provided.  The format of the
962797Sjg  * list element is under the control of the client.
972797Sjg  *
982797Sjg  * Locking: the address of the data list r/w lock provided
992797Sjg  * can be accessed with nvf_lock().  The lock must be held
1002797Sjg  * as reader when traversing the list or checking state,
1012797Sjg  * such as nvf_is_dirty().  The lock must be held as
1022797Sjg  * writer when updating the list or marking it dirty.
1032797Sjg  * The lock must not be held when waking the daemon.
1042797Sjg  *
1052797Sjg  * The data r/w lock is held as writer when the pack,
1062797Sjg  * unpack and free list handlers are called.  The
1072797Sjg  * lock should not be dropped and must be still held
1082797Sjg  * upon return.  The client should also hold the lock
1092797Sjg  * as reader when checking if the list is dirty, and
1102797Sjg  * as writer when marking the list dirty or initiating
1112797Sjg  * a read.
1122797Sjg  *
1132797Sjg  * The asynchronous nature of updates allows for the
1142797Sjg  * possibility that the data may continue to be updated
1152797Sjg  * once the daemon has been notified that an update is
1162797Sjg  * desired.  The data only needs to be locked against
1172797Sjg  * updates when packing the data into the form to be
1182797Sjg  * written.  When the write of the packed data has
1192797Sjg  * completed, the daemon will automatically reschedule
1202797Sjg  * an update if the data was marked dirty after the
1212797Sjg  * point at which it was packed.  Before beginning an
1222797Sjg  * update, the daemon attempts to lock the data as
1232797Sjg  * writer; if the writer lock is already held, it
1242797Sjg  * backs off and retries later.  The model is to give
1252797Sjg  * priority to the kernel processes generating the
1262797Sjg  * data, and that the nature of the data is that
1272797Sjg  * it does not change often, can be re-generated when
1282797Sjg  * needed, so updates should not happen often and
1292797Sjg  * can be delayed until the data stops changing.
1302797Sjg  * The client may update the list or mark it dirty
1312797Sjg  * any time it is able to acquire the lock as
1322797Sjg  * writer first.
1332797Sjg  *
1342797Sjg  * A failed write will be retried after some delay,
1352797Sjg  * in the hope that the cause of the error will be
1362797Sjg  * transient, for example a filesystem with no space
1372797Sjg  * available.  An update on a read-only filesystem
1382797Sjg  * is failed silently and not retried; this would be
1392797Sjg  * the case when booted off install media.
1402797Sjg  *
1412797Sjg  * There is no unregister mechanism as of yet, as it
1422797Sjg  * hasn't been needed so far.
1432797Sjg  */
1442797Sjg 
1452797Sjg /*
1462797Sjg  * Global list of files registered and updated by the nvpflush
1472797Sjg  * daemon, protected by the nvf_cache_mutex.  While an
1482797Sjg  * update is taking place, a file is temporarily moved to
1492797Sjg  * the dirty list to avoid locking the primary list for
1502797Sjg  * the duration of the update.
1512797Sjg  */
1522797Sjg list_t		nvf_cache_files;
1532797Sjg list_t		nvf_dirty_files;
1542797Sjg kmutex_t	nvf_cache_mutex;
1552797Sjg 
1562797Sjg 
1572797Sjg /*
1582797Sjg  * Allow some delay from an update of the data before flushing
1592797Sjg  * to permit simultaneous updates of multiple changes.
1602797Sjg  * Changes in the data are expected to be bursty, ie
1612797Sjg  * reconfig or hot-plug of a new adapter.
1622797Sjg  *
1632797Sjg  * kfio_report_error (default 0)
1642797Sjg  *	Set to 1 to enable some error messages related to low-level
1652797Sjg  *	kernel file i/o operations.
1662797Sjg  *
1672797Sjg  * nvpflush_delay (default 10)
1682797Sjg  *	The number of seconds after data is marked dirty before the
1692797Sjg  *	flush daemon is triggered to flush the data.  A longer period
1702797Sjg  *	of time permits more data updates per write.  Note that
1712797Sjg  *	every update resets the timer so no repository write will
1722797Sjg  *	occur while data is being updated continuously.
1732797Sjg  *
1742797Sjg  * nvpdaemon_idle_time (default 60)
1752797Sjg  *	The number of seconds the daemon will sleep idle before exiting.
1762797Sjg  *
1772797Sjg  */
1782797Sjg #define	NVPFLUSH_DELAY		10
1792797Sjg #define	NVPDAEMON_IDLE_TIME	60
1802797Sjg 
1812797Sjg #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
1822797Sjg 
1832797Sjg /*
1842797Sjg  * Tunables
1852797Sjg  */
1862797Sjg int kfio_report_error = 0;		/* kernel file i/o operations */
1872797Sjg int kfio_disable_read = 0;		/* disable all reads */
1882797Sjg int kfio_disable_write = 0;		/* disable all writes */
1892797Sjg 
1902797Sjg int nvpflush_delay	= NVPFLUSH_DELAY;
1912797Sjg int nvpdaemon_idle_time	= NVPDAEMON_IDLE_TIME;
1922797Sjg 
1932797Sjg static timeout_id_t	nvpflush_id = 0;
1942797Sjg static int		nvpflush_timer_busy = 0;
1952797Sjg static int		nvpflush_daemon_active = 0;
1962797Sjg static kthread_t	*nvpflush_thr_id = 0;
1972797Sjg 
1982797Sjg static int		do_nvpflush = 0;
1992797Sjg static int		nvpbusy = 0;
2002797Sjg static kmutex_t		nvpflush_lock;
2012797Sjg static kcondvar_t	nvpflush_cv;
2022797Sjg static kthread_id_t	nvpflush_thread;
2032797Sjg static clock_t		nvpticks;
2042797Sjg 
2052797Sjg static void nvpflush_daemon(void);
2062797Sjg 
2072797Sjg #ifdef	DEBUG
2082797Sjg int nvpdaemon_debug = 0;
2092797Sjg int kfio_debug = 0;
2102797Sjg #endif	/* DEBUG */
2112797Sjg 
2122797Sjg extern int modrootloaded;
2132797Sjg extern void mdi_read_devices_files(void);
2142797Sjg extern void mdi_clean_vhcache(void);
2152797Sjg 
2162797Sjg /*
2172797Sjg  * Initialize the overall cache file management
2182797Sjg  */
2192797Sjg void
2202797Sjg i_ddi_devices_init(void)
2212797Sjg {
2222797Sjg 	list_create(&nvf_cache_files, sizeof (nvfd_t),
2232797Sjg 	    offsetof(nvfd_t, nvf_link));
2242797Sjg 	list_create(&nvf_dirty_files, sizeof (nvfd_t),
2252797Sjg 	    offsetof(nvfd_t, nvf_link));
2262797Sjg 	mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL);
2274845Svikram 	retire_store_init();
2282797Sjg 	devid_cache_init();
2292797Sjg }
2302797Sjg 
2312797Sjg /*
2322797Sjg  * Read cache files
2332797Sjg  * The files read here should be restricted to those
2342797Sjg  * that may be required to mount root.
2352797Sjg  */
2362797Sjg void
2372797Sjg i_ddi_read_devices_files(void)
2382797Sjg {
2394845Svikram 	/*
2404845Svikram 	 * The retire store should be the first file read as it
2414845Svikram 	 * may need to offline devices. kfio_disable_read is not
2424845Svikram 	 * used for retire. For the rationale see the tunable
2434845Svikram 	 * ddi_retire_store_bypass and comments in:
2444845Svikram 	 *	uts/common/os/retire_store.c
2454845Svikram 	 */
2464845Svikram 
2474845Svikram 	retire_store_read();
2484845Svikram 
2492797Sjg 	if (!kfio_disable_read) {
2502797Sjg 		mdi_read_devices_files();
2512797Sjg 		devid_cache_read();
2522797Sjg 	}
2532797Sjg }
2542797Sjg 
2552797Sjg void
2562797Sjg i_ddi_start_flush_daemon(void)
2572797Sjg {
2582797Sjg 	nvfd_t	*nvfdp;
2592797Sjg 
2602797Sjg 	ASSERT(i_ddi_io_initialized());
2612797Sjg 
2622797Sjg 	mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL);
2632797Sjg 	cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL);
2642797Sjg 
2652797Sjg 	mutex_enter(&nvf_cache_mutex);
2662797Sjg 	for (nvfdp = list_head(&nvf_cache_files); nvfdp;
2672797Sjg 	    nvfdp = list_next(&nvf_cache_files, nvfdp)) {
2682797Sjg 		if (NVF_IS_DIRTY(nvfdp)) {
2692797Sjg 			nvf_wake_daemon();
2702797Sjg 			break;
2712797Sjg 		}
2722797Sjg 	}
2732797Sjg 	mutex_exit(&nvf_cache_mutex);
2742797Sjg }
2752797Sjg 
/*
 * Run the cleanup routines of the caches managed here: the devid
 * cache first, then the mdi vhci cache.
 */
void
i_ddi_clean_devices_files(void)
{
	devid_cache_cleanup();
	mdi_clean_vhcache();
}
2822797Sjg 
2832797Sjg /*
2842797Sjg  * Register a cache file to be managed and updated by the nvpflush daemon.
2852797Sjg  * All operations are performed through the returned handle.
2862797Sjg  * There is no unregister mechanism for now.
2872797Sjg  */
2882797Sjg nvf_handle_t
2892797Sjg nvf_register_file(nvf_ops_t *ops)
2902797Sjg {
2912797Sjg 	nvfd_t *nvfdp;
2922797Sjg 
2932797Sjg 	nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP);
2942797Sjg 
2952797Sjg 	nvfdp->nvf_ops = ops;
2962797Sjg 	nvfdp->nvf_flags = 0;
2972797Sjg 	rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL);
2982797Sjg 
2992797Sjg 	mutex_enter(&nvf_cache_mutex);
3002797Sjg 	list_insert_tail(&nvf_cache_files, nvfdp);
3012797Sjg 	mutex_exit(&nvf_cache_mutex);
3022797Sjg 
3032797Sjg 	return ((nvf_handle_t)nvfdp);
3042797Sjg }
3052797Sjg 
3062797Sjg /*PRINTFLIKE1*/
3072797Sjg void
3082797Sjg nvf_error(const char *fmt, ...)
3092797Sjg {
3102797Sjg 	va_list ap;
3112797Sjg 
3122797Sjg 	if (kfio_report_error) {
3132797Sjg 		va_start(ap, fmt);
3142797Sjg 		vcmn_err(CE_NOTE, fmt, ap);
3152797Sjg 		va_end(ap);
3162797Sjg 	}
3172797Sjg }
3182797Sjg 
3192797Sjg /*
3202797Sjg  * Some operations clients may use to manage the data
3212797Sjg  * to be persisted in a cache file.
3222797Sjg  */
3232797Sjg char *
3242797Sjg nvf_cache_name(nvf_handle_t handle)
3252797Sjg {
3262797Sjg 	return (((nvfd_t *)handle)->nvf_cache_path);
3272797Sjg }
3282797Sjg 
3292797Sjg krwlock_t *
3302797Sjg nvf_lock(nvf_handle_t handle)
3312797Sjg {
3322797Sjg 	return (&(((nvfd_t *)handle)->nvf_lock));
3332797Sjg }
3342797Sjg 
3352797Sjg list_t *
3362797Sjg nvf_list(nvf_handle_t handle)
3372797Sjg {
3382797Sjg 	return (&(((nvfd_t *)handle)->nvf_data_list));
3392797Sjg }
3402797Sjg 
3412797Sjg void
3422797Sjg nvf_mark_dirty(nvf_handle_t handle)
3432797Sjg {
3442797Sjg 	ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock)));
3452797Sjg 	NVF_MARK_DIRTY((nvfd_t *)handle);
3462797Sjg }
3472797Sjg 
3482797Sjg int
3492797Sjg nvf_is_dirty(nvf_handle_t handle)
3502797Sjg {
3512797Sjg 	ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock)));
3522797Sjg 	return (NVF_IS_DIRTY((nvfd_t *)handle));
3532797Sjg }
3542797Sjg 
3552797Sjg static uint16_t
3562797Sjg nvp_cksum(uchar_t *buf, int64_t buflen)
3572797Sjg {
3582797Sjg 	uint16_t cksum = 0;
3592797Sjg 	uint16_t *p = (uint16_t *)buf;
3602797Sjg 	int64_t n;
3612797Sjg 
3622797Sjg 	if ((buflen & 0x01) != 0) {
3632797Sjg 		buflen--;
3642797Sjg 		cksum = buf[buflen];
3652797Sjg 	}
3662797Sjg 	n = buflen / 2;
3672797Sjg 	while (n-- > 0)
3682797Sjg 		cksum ^= *p++;
3692797Sjg 	return (cksum);
3702797Sjg }
3712797Sjg 
3722797Sjg int
3732797Sjg fread_nvlist(char *filename, nvlist_t **ret_nvlist)
3742797Sjg {
3752797Sjg 	struct _buf	*file;
3762797Sjg 	nvpf_hdr_t	hdr;
3772797Sjg 	char		*buf;
3782797Sjg 	nvlist_t	*nvl;
3792797Sjg 	int		rval;
3802797Sjg 	uint_t		offset;
3812797Sjg 	int		n;
3822797Sjg 	char		c;
3832797Sjg 	uint16_t	cksum, hdrsum;
3842797Sjg 
3852797Sjg 	*ret_nvlist = NULL;
3862797Sjg 
3872797Sjg 	file = kobj_open_file(filename);
3882797Sjg 	if (file == (struct _buf *)-1) {
3892797Sjg 		KFDEBUG((CE_CONT, "cannot open file: %s\n", filename));
3902797Sjg 		return (ENOENT);
3912797Sjg 	}
3922797Sjg 
3932797Sjg 	offset = 0;
3942797Sjg 	n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset);
3952797Sjg 	if (n != sizeof (hdr)) {
3962797Sjg 		kobj_close_file(file);
3972797Sjg 		if (n < 0) {
3982797Sjg 			nvf_error("error reading header: %s\n", filename);
3992797Sjg 			return (EIO);
4002797Sjg 		} else if (n == 0) {
4012797Sjg 			KFDEBUG((CE_CONT, "file empty: %s\n", filename));
4022797Sjg 		} else {
4032797Sjg 			nvf_error("header size incorrect: %s\n", filename);
4042797Sjg 		}
4052797Sjg 		return (EINVAL);
4062797Sjg 	}
4072797Sjg 	offset += n;
4082797Sjg 
4092797Sjg 	KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic));
4102797Sjg 	KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version));
4112797Sjg 	KFDEBUG2((CE_CONT, "nvpf_size: %lld\n",
4122797Sjg 		(longlong_t)hdr.nvpf_size));
4132797Sjg 	KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n",
4142797Sjg 		hdr.nvpf_hdr_chksum));
4152797Sjg 	KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum));
4162797Sjg 
4172797Sjg 	cksum = hdr.nvpf_hdr_chksum;
4182797Sjg 	hdr.nvpf_hdr_chksum = 0;
4192797Sjg 	hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr));
4202797Sjg 
4212797Sjg 	if (hdr.nvpf_magic != NVPF_HDR_MAGIC ||
4222797Sjg 	    hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) {
4232797Sjg 		kobj_close_file(file);
4242797Sjg 		if (hdrsum != cksum) {
4252797Sjg 			nvf_error("%s: checksum error "
4262797Sjg 			    "(actual 0x%x, expected 0x%x)\n",
4272797Sjg 			    filename, hdrsum, cksum);
4282797Sjg 		}
4292797Sjg 		nvf_error("%s: header information incorrect", filename);
4302797Sjg 		return (EINVAL);
4312797Sjg 	}
4322797Sjg 
4332797Sjg 	ASSERT(hdr.nvpf_size >= 0);
4342797Sjg 
4352797Sjg 	buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP);
4362797Sjg 	n = kobj_read_file(file, buf, hdr.nvpf_size, offset);
4372797Sjg 	if (n != hdr.nvpf_size) {
4382797Sjg 		kmem_free(buf, hdr.nvpf_size);
4392797Sjg 		kobj_close_file(file);
4402797Sjg 		if (n < 0) {
4412797Sjg 			nvf_error("%s: read error %d", filename, n);
4422797Sjg 		} else {
4432797Sjg 			nvf_error("%s: incomplete read %d/%lld",
4442797Sjg 				filename, n, (longlong_t)hdr.nvpf_size);
4452797Sjg 		}
4462797Sjg 		return (EINVAL);
4472797Sjg 	}
4482797Sjg 	offset += n;
4492797Sjg 
4502797Sjg 	rval = kobj_read_file(file, &c, 1, offset);
4512797Sjg 	kobj_close_file(file);
4522797Sjg 	if (rval > 0) {
4532797Sjg 		nvf_error("%s is larger than %lld\n",
4542797Sjg 			filename, (longlong_t)hdr.nvpf_size);
4552797Sjg 		kmem_free(buf, hdr.nvpf_size);
4562797Sjg 		return (EINVAL);
4572797Sjg 	}
4582797Sjg 
4592797Sjg 	cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size);
4602797Sjg 	if (hdr.nvpf_chksum != cksum) {
4612797Sjg 		nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n",
4622797Sjg 		    filename, hdr.nvpf_chksum, cksum);
4632797Sjg 		kmem_free(buf, hdr.nvpf_size);
4642797Sjg 		return (EINVAL);
4652797Sjg 	}
4662797Sjg 
4672797Sjg 	nvl = NULL;
4682797Sjg 	rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0);
4692797Sjg 	if (rval != 0) {
4702797Sjg 		nvf_error("%s: error %d unpacking nvlist\n",
4712797Sjg 			filename, rval);
4722797Sjg 		kmem_free(buf, hdr.nvpf_size);
4732797Sjg 		return (EINVAL);
4742797Sjg 	}
4752797Sjg 
4762797Sjg 	kmem_free(buf, hdr.nvpf_size);
4772797Sjg 	*ret_nvlist = nvl;
4782797Sjg 	return (0);
4792797Sjg }
4802797Sjg 
4812797Sjg static int
4822797Sjg kfcreate(char *filename, kfile_t **kfilep)
4832797Sjg {
4842797Sjg 	kfile_t	*fp;
4852797Sjg 	int	rval;
4862797Sjg 
4872797Sjg 	ASSERT(modrootloaded);
4882797Sjg 
4892797Sjg 	fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP);
4902797Sjg 
4912797Sjg 	fp->kf_vnflags = FCREAT | FWRITE | FTRUNC;
4922797Sjg 	fp->kf_fname = filename;
4932797Sjg 	fp->kf_fpos = 0;
4942797Sjg 	fp->kf_state = 0;
4952797Sjg 
4962797Sjg 	KFDEBUG((CE_CONT, "create: %s flags 0x%x\n",
4972797Sjg 		filename, fp->kf_vnflags));
4982797Sjg 	rval = vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags,
4992797Sjg 	    0444, &fp->kf_vp, CRCREAT, 0);
5002797Sjg 	if (rval != 0) {
5012797Sjg 		kmem_free(fp, sizeof (kfile_t));
5022797Sjg 		KFDEBUG((CE_CONT, "%s: create error %d\n",
5032797Sjg 			filename, rval));
5042797Sjg 		return (rval);
5052797Sjg 	}
5062797Sjg 
5072797Sjg 	*kfilep = fp;
5082797Sjg 	return (0);
5092797Sjg }
5102797Sjg 
5112797Sjg static int
5122797Sjg kfremove(char *filename)
5132797Sjg {
5142797Sjg 	int rval;
5152797Sjg 
5162797Sjg 	KFDEBUG((CE_CONT, "remove: %s\n", filename));
5172797Sjg 	rval = vn_remove(filename, UIO_SYSSPACE, RMFILE);
5182797Sjg 	if (rval != 0) {
5192797Sjg 		KFDEBUG((CE_CONT, "%s: remove error %d\n",
5202797Sjg 			filename, rval));
5212797Sjg 	}
5222797Sjg 	return (rval);
5232797Sjg }
5242797Sjg 
5252797Sjg static int
5262797Sjg kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
5272797Sjg {
5282797Sjg 	ssize_t		resid;
5292797Sjg 	int		err;
5302797Sjg 	ssize_t		n;
5312797Sjg 
5322797Sjg 	ASSERT(modrootloaded);
5332797Sjg 
5342797Sjg 	if (fp->kf_state != 0)
5352797Sjg 		return (fp->kf_state);
5362797Sjg 
5372797Sjg 	err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos,
5382797Sjg 		UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid);
5392797Sjg 	if (err != 0) {
5402797Sjg 		KFDEBUG((CE_CONT, "%s: read error %d\n",
5412797Sjg 			fp->kf_fname, err));
5422797Sjg 		fp->kf_state = err;
5432797Sjg 		return (err);
5442797Sjg 	}
5452797Sjg 
5462797Sjg 	ASSERT(resid >= 0 && resid <= bufsiz);
5472797Sjg 	n = bufsiz - resid;
5482797Sjg 
5492797Sjg 	KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n",
5502797Sjg 		fp->kf_fname, n, bufsiz, resid));
5512797Sjg 
5522797Sjg 	fp->kf_fpos += n;
5532797Sjg 	*ret_n = n;
5542797Sjg 	return (0);
5552797Sjg }
5562797Sjg 
5572797Sjg static int
5582797Sjg kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
5592797Sjg {
5602797Sjg 	rlim64_t	rlimit;
5612797Sjg 	ssize_t		resid;
5622797Sjg 	int		err;
5632797Sjg 	ssize_t		len;
5642797Sjg 	ssize_t		n = 0;
5652797Sjg 
5662797Sjg 	ASSERT(modrootloaded);
5672797Sjg 
5682797Sjg 	if (fp->kf_state != 0)
5692797Sjg 		return (fp->kf_state);
5702797Sjg 
5712797Sjg 	len = bufsiz;
5722797Sjg 	rlimit = bufsiz + 1;
5732797Sjg 	for (;;) {
5742797Sjg 		err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos,
5752797Sjg 			UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
5762797Sjg 		if (err) {
5772797Sjg 			KFDEBUG((CE_CONT, "%s: write error %d\n",
5782797Sjg 				fp->kf_fname, err));
5792797Sjg 			fp->kf_state = err;
5802797Sjg 			return (err);
5812797Sjg 		}
5822797Sjg 
5832797Sjg 		KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n",
5842797Sjg 			fp->kf_fname, len-resid, resid));
5852797Sjg 
5862797Sjg 		ASSERT(resid >= 0 && resid <= len);
5872797Sjg 
5882797Sjg 		n += (len - resid);
5892797Sjg 		if (resid == 0)
5902797Sjg 			break;
5912797Sjg 
5922797Sjg 		if (resid == len) {
5932797Sjg 			KFDEBUG((CE_CONT, "%s: filesystem full?\n",
5942797Sjg 				fp->kf_fname));
5952797Sjg 			fp->kf_state = ENOSPC;
5962797Sjg 			return (ENOSPC);
5972797Sjg 		}
5982797Sjg 
5992797Sjg 		len -= resid;
6002797Sjg 		buf += len;
6012797Sjg 		fp->kf_fpos += len;
6022797Sjg 		len = resid;
6032797Sjg 	}
6042797Sjg 
6052797Sjg 	ASSERT(n == bufsiz);
6062797Sjg 	KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n));
6072797Sjg 
6082797Sjg 	*ret_n = n;
6092797Sjg 	return (0);
6102797Sjg }
6112797Sjg 
6122797Sjg 
6132797Sjg static int
6142797Sjg kfclose(kfile_t *fp)
6152797Sjg {
6162797Sjg 	int		rval;
6172797Sjg 
6182797Sjg 	KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname));
6192797Sjg 
6202797Sjg 	if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) {
621*5331Samw 		rval = VOP_FSYNC(fp->kf_vp, FSYNC, kcred, NULL);
6222797Sjg 		if (rval != 0) {
6232797Sjg 			nvf_error("%s: sync error %d\n",
6242797Sjg 				fp->kf_fname, rval);
6252797Sjg 		}
6262797Sjg 		KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname));
6272797Sjg 	}
6282797Sjg 
629*5331Samw 	rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1, (offset_t)0, kcred,
630*5331Samw 		NULL);
6312797Sjg 	if (rval != 0) {
6322797Sjg 		if (fp->kf_state == 0) {
6332797Sjg 			nvf_error("%s: close error %d\n",
6342797Sjg 				fp->kf_fname, rval);
6352797Sjg 		}
6362797Sjg 	} else {
6372797Sjg 		if (fp->kf_state == 0)
6382797Sjg 			KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname));
6392797Sjg 	}
6402797Sjg 
6412797Sjg 	VN_RELE(fp->kf_vp);
6422797Sjg 	kmem_free(fp, sizeof (kfile_t));
6432797Sjg 	return (rval);
6442797Sjg }
6452797Sjg 
6462797Sjg static int
6472797Sjg kfrename(char *oldname, char *newname)
6482797Sjg {
6492797Sjg 	int rval;
6502797Sjg 
6512797Sjg 	ASSERT(modrootloaded);
6522797Sjg 
6532797Sjg 	KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname));
6542797Sjg 
6552797Sjg 	if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) {
6562797Sjg 		KFDEBUG((CE_CONT, "rename %s to %s: %d\n",
6572797Sjg 			oldname, newname, rval));
6582797Sjg 	}
6592797Sjg 
6602797Sjg 	return (rval);
6612797Sjg }
6622797Sjg 
6632797Sjg int
6642797Sjg fwrite_nvlist(char *filename, nvlist_t *nvl)
6652797Sjg {
6662797Sjg 	char	*buf;
6672797Sjg 	char	*nvbuf;
6682797Sjg 	kfile_t	*fp;
6692797Sjg 	char	*newname;
6702797Sjg 	int	len, err, err1;
6712797Sjg 	size_t	buflen;
6722797Sjg 	ssize_t	n;
6732797Sjg 
6742797Sjg 	ASSERT(modrootloaded);
6752797Sjg 
6762797Sjg 	nvbuf = NULL;
6772797Sjg 	err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0);
6782797Sjg 	if (err != 0) {
6792797Sjg 		nvf_error("%s: error %d packing nvlist\n",
6802797Sjg 			filename, err);
6812797Sjg 		return (err);
6822797Sjg 	}
6832797Sjg 
6842797Sjg 	buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP);
6852797Sjg 	bzero(buf, sizeof (nvpf_hdr_t));
6862797Sjg 
6872797Sjg 	((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC;
6882797Sjg 	((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION;
6892797Sjg 	((nvpf_hdr_t *)buf)->nvpf_size = buflen;
6902797Sjg 	((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen);
6912797Sjg 	((nvpf_hdr_t *)buf)->nvpf_hdr_chksum =
6922797Sjg 		nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t));
6932797Sjg 
6942797Sjg 	bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen);
6952797Sjg 	kmem_free(nvbuf, buflen);
6962797Sjg 	buflen += sizeof (nvpf_hdr_t);
6972797Sjg 
6982797Sjg 	len = strlen(filename) + MAX_SUFFIX_LEN + 2;
6992797Sjg 	newname = kmem_alloc(len, KM_SLEEP);
7002797Sjg 
7012797Sjg 
7022797Sjg 	(void) sprintf(newname, "%s.%s",
7032797Sjg 		filename, NEW_FILENAME_SUFFIX);
7042797Sjg 
7052797Sjg 	/*
7062797Sjg 	 * To make it unlikely we suffer data loss, write
7072797Sjg 	 * data to the new temporary file.  Once successful
7082797Sjg 	 * complete the transaction by renaming the new file
7092797Sjg 	 * to replace the previous.
7102797Sjg 	 */
7112797Sjg 
7122797Sjg 	if ((err = kfcreate(newname, &fp)) == 0) {
7132797Sjg 		err = kfwrite(fp, buf, buflen, &n);
7142797Sjg 		if (err) {
7152797Sjg 			nvf_error("%s: write error - %d\n",
7162797Sjg 				newname, err);
7172797Sjg 		} else {
7182797Sjg 			if (n != buflen) {
7192797Sjg 				nvf_error(
7202797Sjg 				    "%s: partial write %ld of %ld bytes\n",
7212797Sjg 				    newname, n, buflen);
7222797Sjg 				nvf_error("%s: filesystem may be full?\n",
7232797Sjg 				    newname);
7242797Sjg 				err = EIO;
7252797Sjg 			}
7262797Sjg 		}
7272797Sjg 		if ((err1 = kfclose(fp)) != 0) {
7282797Sjg 			nvf_error("%s: close error\n", newname);
7292797Sjg 			if (err == 0)
7302797Sjg 				err = err1;
7312797Sjg 		}
7322797Sjg 		if (err != 0) {
7332797Sjg 			if (kfremove(newname) != 0) {
7342797Sjg 				nvf_error("%s: remove failed\n",
7352797Sjg 				    newname);
7362797Sjg 			}
7372797Sjg 		}
7382797Sjg 	} else {
7392797Sjg 		nvf_error("%s: create failed - %d\n", filename, err);
7402797Sjg 	}
7412797Sjg 
7422797Sjg 	if (err == 0) {
7432797Sjg 		if ((err = kfrename(newname, filename)) != 0) {
7442797Sjg 			nvf_error("%s: rename from %s failed\n",
7452797Sjg 				newname, filename);
7462797Sjg 		}
7472797Sjg 	}
7482797Sjg 
7492797Sjg 	kmem_free(newname, len);
7502797Sjg 	kmem_free(buf, buflen);
7512797Sjg 
7522797Sjg 	return (err);
7532797Sjg }
7542797Sjg 
7552797Sjg static int
7562797Sjg e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl)
7572797Sjg {
7582797Sjg 	int err;
7592797Sjg 
7602797Sjg 	if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0)
7612797Sjg 		return (DDI_SUCCESS);
7622797Sjg 	else {
7632797Sjg 		if (err == EROFS)
7642797Sjg 			NVF_MARK_READONLY(nvfd);
7652797Sjg 		return (DDI_FAILURE);
7662797Sjg 	}
7672797Sjg }
7682797Sjg 
7692797Sjg static void
7702797Sjg nvp_list_free(nvfd_t *nvf)
7712797Sjg {
7722797Sjg 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
7732797Sjg 	(nvf->nvf_list_free)((nvf_handle_t)nvf);
7742797Sjg 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
7752797Sjg }
7762797Sjg 
7772797Sjg /*
7782797Sjg  * Read a file in the nvlist format
7792797Sjg  *	EIO - i/o error during read
7802797Sjg  *	ENOENT - file not found
7812797Sjg  *	EINVAL - file contents corrupted
7822797Sjg  */
7832797Sjg static int
7842797Sjg fread_nvp_list(nvfd_t *nvfd)
7852797Sjg {
7862797Sjg 	nvlist_t	*nvl;
7872797Sjg 	nvpair_t	*nvp;
7882797Sjg 	char		*name;
7892797Sjg 	nvlist_t	*sublist;
7902797Sjg 	int		rval;
7912797Sjg 	int		rv;
7922797Sjg 
7932797Sjg 	ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
7942797Sjg 
7952797Sjg 	rval = fread_nvlist(nvfd->nvf_cache_path, &nvl);
7962797Sjg 	if (rval != 0)
7972797Sjg 		return (rval);
7982797Sjg 	ASSERT(nvl != NULL);
7992797Sjg 
8002797Sjg 	nvp = NULL;
8012797Sjg 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
8022797Sjg 		name = nvpair_name(nvp);
8032797Sjg 		ASSERT(strlen(name) > 0);
8042797Sjg 
8052797Sjg 		switch (nvpair_type(nvp)) {
8062797Sjg 		case DATA_TYPE_NVLIST:
8072797Sjg 			rval = nvpair_value_nvlist(nvp, &sublist);
8082797Sjg 			if (rval != 0) {
8092797Sjg 				nvf_error(
8102797Sjg 				    "nvpair_value_nvlist error %s %d\n",
8112797Sjg 				    name, rval);
8122797Sjg 				goto error;
8132797Sjg 			}
8142797Sjg 
8152797Sjg 			/*
8162797Sjg 			 * unpack nvlist for this device and
8172797Sjg 			 * add elements to data list.
8182797Sjg 			 */
8192797Sjg 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
8202797Sjg 			rv = (nvfd->nvf_unpack_nvlist)
8212797Sjg 			    ((nvf_handle_t)nvfd, sublist, name);
8222797Sjg 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
8232797Sjg 			if (rv != 0) {
8242797Sjg 				nvf_error(
8252797Sjg 				    "%s: %s invalid list element\n",
8262797Sjg 				    nvfd->nvf_cache_path, name);
8272797Sjg 				rval = EINVAL;
8282797Sjg 				goto error;
8292797Sjg 			}
8302797Sjg 			break;
8312797Sjg 
8322797Sjg 		default:
8332797Sjg 			nvf_error("%s: %s unsupported data type %d\n",
8342797Sjg 				nvfd->nvf_cache_path, name, nvpair_type(nvp));
8352797Sjg 			rval = EINVAL;
8362797Sjg 			goto error;
8372797Sjg 		}
8382797Sjg 	}
8392797Sjg 
8402797Sjg 	nvlist_free(nvl);
8412797Sjg 
8422797Sjg 	return (0);
8432797Sjg 
8442797Sjg error:
8452797Sjg 	nvlist_free(nvl);
8462797Sjg 	nvp_list_free(nvfd);
8472797Sjg 	return (rval);
8482797Sjg }
8492797Sjg 
8502797Sjg 
8512797Sjg int
8522797Sjg nvf_read_file(nvf_handle_t nvf_handle)
8532797Sjg {
8542797Sjg 	nvfd_t *nvfd = (nvfd_t *)nvf_handle;
8552797Sjg 	int rval;
8562797Sjg 
8572797Sjg 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
8582797Sjg 
8592797Sjg 	if (kfio_disable_read)
8602797Sjg 		return (0);
8612797Sjg 
8622797Sjg 	KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path));
8632797Sjg 
8642797Sjg 	rval = fread_nvp_list(nvfd);
8652797Sjg 	if (rval) {
8662797Sjg 		switch (rval) {
8672797Sjg 		case EIO:
8682797Sjg 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
8692797Sjg 			cmn_err(CE_WARN, "%s: I/O error",
8702797Sjg 				nvfd->nvf_cache_path);
8712797Sjg 			break;
8722797Sjg 		case ENOENT:
8732797Sjg 			nvfd->nvf_flags |= NVF_F_CREATE_MSG;
8742797Sjg 			nvf_error("%s: not found\n",
8752797Sjg 				nvfd->nvf_cache_path);
8762797Sjg 			break;
8772797Sjg 		case EINVAL:
8782797Sjg 		default:
8792797Sjg 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
8802797Sjg 			cmn_err(CE_WARN, "%s: data file corrupted",
8812797Sjg 				nvfd->nvf_cache_path);
8822797Sjg 			break;
8832797Sjg 		}
8842797Sjg 	}
8852797Sjg 	return (rval);
8862797Sjg }
8872797Sjg 
8882797Sjg static void
8892797Sjg nvf_write_is_complete(nvfd_t *fd)
8902797Sjg {
8912797Sjg 	if (fd->nvf_write_complete) {
8922797Sjg 		(fd->nvf_write_complete)((nvf_handle_t)fd);
8932797Sjg 	}
8942797Sjg }
8952797Sjg 
8962797Sjg /*ARGSUSED*/
8972797Sjg static void
8982797Sjg nvpflush_timeout(void *arg)
8992797Sjg {
9002797Sjg 	clock_t nticks;
9012797Sjg 
9022797Sjg 	mutex_enter(&nvpflush_lock);
9032797Sjg 	nticks = nvpticks - ddi_get_lbolt();
9042797Sjg 	if (nticks > 4) {
9052797Sjg 		nvpflush_timer_busy = 1;
9062797Sjg 		mutex_exit(&nvpflush_lock);
9072797Sjg 		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks);
9082797Sjg 	} else {
9092797Sjg 		do_nvpflush = 1;
9102797Sjg 		NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n"));
9112797Sjg 		cv_signal(&nvpflush_cv);
9122797Sjg 		nvpflush_id = 0;
9132797Sjg 		nvpflush_timer_busy = 0;
9142797Sjg 		mutex_exit(&nvpflush_lock);
9152797Sjg 	}
9162797Sjg }
9172797Sjg 
9182797Sjg /*
9192797Sjg  * After marking a list as dirty, wake the nvpflush daemon
9202797Sjg  * to perform the update.
9212797Sjg  */
9222797Sjg void
9232797Sjg nvf_wake_daemon(void)
9242797Sjg {
9252797Sjg 	clock_t nticks;
9262797Sjg 
9272797Sjg 	/*
9282797Sjg 	 * If the system isn't up yet
9292797Sjg 	 * don't even think about starting a flush.
9302797Sjg 	 */
9312797Sjg 	if (!i_ddi_io_initialized())
9322797Sjg 		return;
9332797Sjg 
9342797Sjg 	mutex_enter(&nvpflush_lock);
9352797Sjg 
9362797Sjg 	if (nvpflush_daemon_active == 0) {
9372797Sjg 		nvpflush_daemon_active = 1;
9382797Sjg 		mutex_exit(&nvpflush_lock);
9392797Sjg 		NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n"));
9402797Sjg 		nvpflush_thr_id = thread_create(NULL, 0,
9412797Sjg 		    (void (*)())nvpflush_daemon,
9422797Sjg 		    NULL, 0, &p0, TS_RUN, minclsyspri);
9432797Sjg 		mutex_enter(&nvpflush_lock);
9442797Sjg 	}
9452797Sjg 
9462797Sjg 	nticks = nvpflush_delay * TICKS_PER_SECOND;
9472797Sjg 	nvpticks = ddi_get_lbolt() + nticks;
9482797Sjg 	if (nvpflush_timer_busy == 0) {
9492797Sjg 		nvpflush_timer_busy = 1;
9502797Sjg 		mutex_exit(&nvpflush_lock);
9512797Sjg 		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4);
9522797Sjg 	} else
9532797Sjg 		mutex_exit(&nvpflush_lock);
9542797Sjg }
9552797Sjg 
9562797Sjg static int
9572797Sjg nvpflush_one(nvfd_t *nvfd)
9582797Sjg {
9592797Sjg 	int rval = DDI_SUCCESS;
9602797Sjg 	nvlist_t *nvl;
9612797Sjg 
9622797Sjg 	rw_enter(&nvfd->nvf_lock, RW_READER);
9632797Sjg 
9642797Sjg 	ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0);
9652797Sjg 
9662797Sjg 	if (!NVF_IS_DIRTY(nvfd) ||
9672797Sjg 	    NVF_IS_READONLY(nvfd) || kfio_disable_write) {
9682797Sjg 		NVF_CLEAR_DIRTY(nvfd);
9692797Sjg 		rw_exit(&nvfd->nvf_lock);
9702797Sjg 		return (DDI_SUCCESS);
9712797Sjg 	}
9722797Sjg 
9732797Sjg 	if (rw_tryupgrade(&nvfd->nvf_lock) == 0) {
9742797Sjg 		nvf_error("nvpflush: "
9752797Sjg 		    "%s rw upgrade failed\n", nvfd->nvf_cache_path);
9762797Sjg 		rw_exit(&nvfd->nvf_lock);
9772797Sjg 		return (DDI_FAILURE);
9782797Sjg 	}
9792797Sjg 	if (((nvfd->nvf_pack_list)
9802797Sjg 	    ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) {
9812797Sjg 		nvf_error("nvpflush: "
9822797Sjg 		    "%s nvlist construction failed\n", nvfd->nvf_cache_path);
9832797Sjg 		ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
9842797Sjg 		rw_exit(&nvfd->nvf_lock);
9852797Sjg 		return (DDI_FAILURE);
9862797Sjg 	}
9872797Sjg 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
9882797Sjg 
9892797Sjg 	NVF_CLEAR_DIRTY(nvfd);
9902797Sjg 	nvfd->nvf_flags |= NVF_F_FLUSHING;
9912797Sjg 	rw_exit(&nvfd->nvf_lock);
9922797Sjg 
9932797Sjg 	rval = e_fwrite_nvlist(nvfd, nvl);
9942797Sjg 	nvlist_free(nvl);
9952797Sjg 
9962797Sjg 	rw_enter(&nvfd->nvf_lock, RW_WRITER);
9972797Sjg 	nvfd->nvf_flags &= ~NVF_F_FLUSHING;
9982797Sjg 	if (rval == DDI_FAILURE) {
9992797Sjg 		if (NVF_IS_READONLY(nvfd)) {
10002797Sjg 			rval = DDI_SUCCESS;
10012797Sjg 			nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY);
10022797Sjg 		} else if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) {
10032797Sjg 			cmn_err(CE_CONT,
10042797Sjg 			    "%s: updated failed\n", nvfd->nvf_cache_path);
10052797Sjg 			nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY;
10062797Sjg 		}
10072797Sjg 	} else {
10082797Sjg 		if (nvfd->nvf_flags & NVF_F_CREATE_MSG) {
10092797Sjg 			cmn_err(CE_CONT,
10102797Sjg 			    "!Creating %s\n", nvfd->nvf_cache_path);
10112797Sjg 			nvfd->nvf_flags &= ~NVF_F_CREATE_MSG;
10122797Sjg 		}
10132797Sjg 		if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) {
10142797Sjg 			cmn_err(CE_CONT,
10152797Sjg 			    "!Rebuilding %s\n", nvfd->nvf_cache_path);
10162797Sjg 			nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG;
10172797Sjg 		}
10182797Sjg 		if (nvfd->nvf_flags & NVF_F_ERROR) {
10192797Sjg 			cmn_err(CE_CONT,
10202797Sjg 			    "%s: update now ok\n", nvfd->nvf_cache_path);
10212797Sjg 			nvfd->nvf_flags &= ~NVF_F_ERROR;
10222797Sjg 		}
10232797Sjg 		/*
10242797Sjg 		 * The file may need to be flushed again if the cached
10252797Sjg 		 * data was touched while writing the earlier contents.
10262797Sjg 		 */
10272797Sjg 		if (NVF_IS_DIRTY(nvfd))
10282797Sjg 			rval = DDI_FAILURE;
10292797Sjg 	}
10302797Sjg 
10312797Sjg 	rw_exit(&nvfd->nvf_lock);
10322797Sjg 	return (rval);
10332797Sjg }
10342797Sjg 
10352797Sjg 
10362797Sjg static void
10372797Sjg nvpflush_daemon(void)
10382797Sjg {
10392797Sjg 	callb_cpr_t cprinfo;
10402797Sjg 	nvfd_t *nvfdp, *nextfdp;
10412797Sjg 	clock_t clk;
10422797Sjg 	int rval;
10432797Sjg 	int want_wakeup;
10442797Sjg 	int is_now_clean;
10452797Sjg 
10462797Sjg 	ASSERT(modrootloaded);
10472797Sjg 
10482797Sjg 	nvpflush_thread = curthread;
10492797Sjg 	NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n"));
10502797Sjg 
10512797Sjg 	CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp");
10522797Sjg 	mutex_enter(&nvpflush_lock);
10532797Sjg 	for (;;) {
10542797Sjg 
10552797Sjg 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
10562797Sjg 		while (do_nvpflush == 0) {
10572797Sjg 			clk = cv_timedwait(&nvpflush_cv, &nvpflush_lock,
10582797Sjg 			    ddi_get_lbolt() +
10592797Sjg 				(nvpdaemon_idle_time * TICKS_PER_SECOND));
10602797Sjg 			if (clk == -1 &&
10612797Sjg 			    do_nvpflush == 0 && nvpflush_timer_busy == 0) {
10622797Sjg 				/*
10632797Sjg 				 * Note that CALLB_CPR_EXIT calls mutex_exit()
10642797Sjg 				 * on the lock passed in to CALLB_CPR_INIT,
10652797Sjg 				 * so the lock must be held when invoking it.
10662797Sjg 				 */
10672797Sjg 				CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
10682797Sjg 				NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n"));
10692797Sjg 				ASSERT(mutex_owned(&nvpflush_lock));
10702797Sjg 				nvpflush_thr_id = NULL;
10712797Sjg 				nvpflush_daemon_active = 0;
10722797Sjg 				CALLB_CPR_EXIT(&cprinfo);
10732797Sjg 				thread_exit();
10742797Sjg 			}
10752797Sjg 		}
10762797Sjg 		CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
10772797Sjg 
10782797Sjg 		nvpbusy = 1;
10792797Sjg 		want_wakeup = 0;
10802797Sjg 		do_nvpflush = 0;
10812797Sjg 		mutex_exit(&nvpflush_lock);
10822797Sjg 
10832797Sjg 		/*
10842797Sjg 		 * Try flushing what's dirty, reschedule if there's
10852797Sjg 		 * a failure or data gets marked as dirty again.
10862797Sjg 		 * First move each file marked dirty to the dirty
10872797Sjg 		 * list to avoid locking the list across the write.
10882797Sjg 		 */
10892797Sjg 		mutex_enter(&nvf_cache_mutex);
10902797Sjg 		for (nvfdp = list_head(&nvf_cache_files);
10912797Sjg 		    nvfdp; nvfdp = nextfdp) {
10922797Sjg 			nextfdp = list_next(&nvf_cache_files, nvfdp);
10932797Sjg 			rw_enter(&nvfdp->nvf_lock, RW_READER);
10942797Sjg 			if (NVF_IS_DIRTY(nvfdp)) {
10952797Sjg 				list_remove(&nvf_cache_files, nvfdp);
10962797Sjg 				list_insert_tail(&nvf_dirty_files, nvfdp);
10972797Sjg 				rw_exit(&nvfdp->nvf_lock);
10982797Sjg 			} else {
10992797Sjg 				NVPDAEMON_DEBUG((CE_CONT,
11002797Sjg 				    "nvpdaemon: not dirty %s\n",
11012797Sjg 				    nvfdp->nvf_cache_path));
11022797Sjg 				rw_exit(&nvfdp->nvf_lock);
11032797Sjg 			}
11042797Sjg 		}
11052797Sjg 		mutex_exit(&nvf_cache_mutex);
11062797Sjg 
11072797Sjg 		/*
11082797Sjg 		 * Now go through the dirty list
11092797Sjg 		 */
11102797Sjg 		for (nvfdp = list_head(&nvf_dirty_files);
11112797Sjg 		    nvfdp; nvfdp = nextfdp) {
11122797Sjg 			nextfdp = list_next(&nvf_dirty_files, nvfdp);
11132797Sjg 
11142797Sjg 			is_now_clean = 0;
11152797Sjg 			rw_enter(&nvfdp->nvf_lock, RW_READER);
11162797Sjg 			if (NVF_IS_DIRTY(nvfdp)) {
11172797Sjg 				NVPDAEMON_DEBUG((CE_CONT,
11182797Sjg 				    "nvpdaemon: flush %s\n",
11192797Sjg 				    nvfdp->nvf_cache_path));
11202797Sjg 				rw_exit(&nvfdp->nvf_lock);
11212797Sjg 				rval = nvpflush_one(nvfdp);
11222797Sjg 				rw_enter(&nvfdp->nvf_lock, RW_READER);
11232797Sjg 				if (rval != DDI_SUCCESS ||
11242797Sjg 				    NVF_IS_DIRTY(nvfdp)) {
11252797Sjg 					rw_exit(&nvfdp->nvf_lock);
11262797Sjg 					NVPDAEMON_DEBUG((CE_CONT,
11272797Sjg 					    "nvpdaemon: %s dirty again\n",
11282797Sjg 					    nvfdp->nvf_cache_path));
11292797Sjg 					want_wakeup = 1;
11302797Sjg 				} else {
11312797Sjg 					rw_exit(&nvfdp->nvf_lock);
11322797Sjg 					nvf_write_is_complete(nvfdp);
11332797Sjg 					is_now_clean = 1;
11342797Sjg 				}
11352797Sjg 			} else {
11362797Sjg 				NVPDAEMON_DEBUG((CE_CONT,
11372797Sjg 				    "nvpdaemon: not dirty %s\n",
11382797Sjg 				    nvfdp->nvf_cache_path));
11392797Sjg 				rw_exit(&nvfdp->nvf_lock);
11402797Sjg 				is_now_clean = 1;
11412797Sjg 			}
11422797Sjg 
11432797Sjg 			if (is_now_clean) {
11442797Sjg 				mutex_enter(&nvf_cache_mutex);
11452797Sjg 				list_remove(&nvf_dirty_files, nvfdp);
11462797Sjg 				list_insert_tail(&nvf_cache_files,
11472797Sjg 				    nvfdp);
11482797Sjg 				mutex_exit(&nvf_cache_mutex);
11492797Sjg 			}
11502797Sjg 		}
11512797Sjg 
11522797Sjg 		if (want_wakeup)
11532797Sjg 			nvf_wake_daemon();
11542797Sjg 
11552797Sjg 		mutex_enter(&nvpflush_lock);
11562797Sjg 		nvpbusy = 0;
11572797Sjg 	}
11582797Sjg }
1159