xref: /onnv-gate/usr/src/uts/common/os/devcache.c (revision 2797:3782a13773c1)
1*2797Sjg /*
2*2797Sjg  * CDDL HEADER START
3*2797Sjg  *
4*2797Sjg  * The contents of this file are subject to the terms of the
5*2797Sjg  * Common Development and Distribution License (the "License").
6*2797Sjg  * You may not use this file except in compliance with the License.
7*2797Sjg  *
8*2797Sjg  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*2797Sjg  * or http://www.opensolaris.org/os/licensing.
10*2797Sjg  * See the License for the specific language governing permissions
11*2797Sjg  * and limitations under the License.
12*2797Sjg  *
13*2797Sjg  * When distributing Covered Code, include this CDDL HEADER in each
14*2797Sjg  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*2797Sjg  * If applicable, add the following below this CDDL HEADER, with the
16*2797Sjg  * fields enclosed by brackets "[]" replaced with your own identifying
17*2797Sjg  * information: Portions Copyright [yyyy] [name of copyright owner]
18*2797Sjg  *
19*2797Sjg  * CDDL HEADER END
20*2797Sjg  */
21*2797Sjg /*
22*2797Sjg  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23*2797Sjg  * Use is subject to license terms.
24*2797Sjg  */
25*2797Sjg 
26*2797Sjg #pragma ident	"%Z%%M%	%I%	%E% SMI"
27*2797Sjg 
28*2797Sjg #include <sys/note.h>
29*2797Sjg #include <sys/t_lock.h>
30*2797Sjg #include <sys/cmn_err.h>
31*2797Sjg #include <sys/instance.h>
32*2797Sjg #include <sys/conf.h>
33*2797Sjg #include <sys/stat.h>
34*2797Sjg #include <sys/ddi.h>
35*2797Sjg #include <sys/hwconf.h>
36*2797Sjg #include <sys/sunddi.h>
37*2797Sjg #include <sys/sunndi.h>
38*2797Sjg #include <sys/ddi_impldefs.h>
39*2797Sjg #include <sys/ndi_impldefs.h>
40*2797Sjg #include <sys/modctl.h>
41*2797Sjg #include <sys/dacf.h>
42*2797Sjg #include <sys/promif.h>
43*2797Sjg #include <sys/cpuvar.h>
44*2797Sjg #include <sys/pathname.h>
45*2797Sjg #include <sys/kobj.h>
46*2797Sjg #include <sys/devcache.h>
47*2797Sjg #include <sys/devcache_impl.h>
48*2797Sjg #include <sys/sysmacros.h>
49*2797Sjg #include <sys/varargs.h>
50*2797Sjg #include <sys/callb.h>
51*2797Sjg 
52*2797Sjg /*
53*2797Sjg  * This facility provides interfaces to clients to register,
54*2797Sjg  * read and update cache data in persisted backing store files,
55*2797Sjg  * usually in /etc/devices.  The data persisted through this
56*2797Sjg  * mechanism should be stateless data, functioning in the sense
57*2797Sjg  * of a cache.  Writes are performed by a background daemon
58*2797Sjg  * thread, permitting a client to schedule an update without
59*2797Sjg  * blocking, then continue updating the data state in
60*2797Sjg  * parallel.  The data is only locked by the daemon thread
61*2797Sjg  * to pack the data in preparation for the write.
62*2797Sjg  *
63*2797Sjg  * Data persisted through this mechanism should be capable
64*2797Sjg  * of being regenerated through normal system operation,
65*2797Sjg  * for example attaching all disk devices would cause all
66*2797Sjg  * devids to be registered for those devices.  By caching
67*2797Sjg  * a devid-device tuple, the system can operate in a
68*2797Sjg  * more optimal way, directly attaching the device mapped
69*2797Sjg  * to a devid, rather than burdensomely driving attach of
70*2797Sjg  * the entire device tree to discover a single device.
71*2797Sjg  *
72*2797Sjg  * Note that a client should only need to include
73*2797Sjg  * <sys/devcache.h> for the supported interfaces.
74*2797Sjg  *
75*2797Sjg  * The data per client is entirely within the control of
76*2797Sjg  * the client.  When reading, data unpacked from the backing
77*2797Sjg  * store should be inserted in the list.  The pointer to
78*2797Sjg  * the list can be retrieved via nvf_list().  When writing,
79*2797Sjg  * the data on the list is to be packed and returned to the
80*2797Sjg  * nvpdaemon as an nvlist.
81*2797Sjg  *
82*2797Sjg  * Obvious restrictions are imposed by the limits of the
83*2797Sjg  * nvlist format.  The data cannot be read or written
84*2797Sjg  * piecemeal, and large amounts of data aren't recommended.
85*2797Sjg  * However, nvlists do allow that data be named and typed
86*2797Sjg  * and can be size-of-int invariant, and the cached data
87*2797Sjg  * can be versioned conveniently.
88*2797Sjg  *
89*2797Sjg  * The registration involves two steps: a handle is
90*2797Sjg  * allocated by calling the registration function.
91*2797Sjg  * This sets up the data referenced by the handle and
92*2797Sjg  * initializes the lock.  Following registration, the
93*2797Sjg  * client must initialize the data list.  The list
94*2797Sjg  * interfaces require that the list element with offset
95*2797Sjg  * to the node link be provided.  The format of the
96*2797Sjg  * list element is under the control of the client.
97*2797Sjg  *
98*2797Sjg  * Locking: the address of the data list r/w lock provided
99*2797Sjg  * can be accessed with nvf_lock().  The lock must be held
100*2797Sjg  * as reader when traversing the list or checking state,
101*2797Sjg  * such as nvf_is_dirty().  The lock must be held as
102*2797Sjg  * writer when updating the list or marking it dirty.
103*2797Sjg  * The lock must not be held when waking the daemon.
104*2797Sjg  *
105*2797Sjg  * The data r/w lock is held as writer when the pack,
106*2797Sjg  * unpack and free list handlers are called.  The
107*2797Sjg  * lock should not be dropped and must be still held
108*2797Sjg  * upon return.  The client should also hold the lock
109*2797Sjg  * as reader when checking if the list is dirty, and
110*2797Sjg  * as writer when marking the list dirty or initiating
111*2797Sjg  * a read.
112*2797Sjg  *
113*2797Sjg  * The asynchronous nature of updates allows for the
114*2797Sjg  * possibility that the data may continue to be updated
115*2797Sjg  * once the daemon has been notified that an update is
116*2797Sjg  * desired.  The data only needs to be locked against
117*2797Sjg  * updates when packing the data into the form to be
118*2797Sjg  * written.  When the write of the packed data has
119*2797Sjg  * completed, the daemon will automatically reschedule
120*2797Sjg  * an update if the data was marked dirty after the
121*2797Sjg  * point at which it was packed.  Before beginning an
122*2797Sjg  * update, the daemon attempts to lock the data as
123*2797Sjg  * writer; if the writer lock is already held, it
124*2797Sjg  * backs off and retries later.  The model is to give
125*2797Sjg  * priority to the kernel processes generating the
126*2797Sjg  * data, and that the nature of the data is that
127*2797Sjg  * it does not change often, can be re-generated when
128*2797Sjg  * needed, so updates should not happen often and
129*2797Sjg  * can be delayed until the data stops changing.
130*2797Sjg  * The client may update the list or mark it dirty
131*2797Sjg  * any time it is able to acquire the lock as
132*2797Sjg  * writer first.
133*2797Sjg  *
134*2797Sjg  * A failed write will be retried after some delay,
135*2797Sjg  * in the hope that the cause of the error will be
136*2797Sjg  * transient, for example a filesystem with no space
137*2797Sjg  * available.  An update on a read-only filesystem
138*2797Sjg  * is failed silently and not retried; this would be
139*2797Sjg  * the case when booted off install media.
140*2797Sjg  *
141*2797Sjg  * There is no unregister mechanism as of yet, as it
142*2797Sjg  * hasn't been needed so far.
143*2797Sjg  */
144*2797Sjg 
145*2797Sjg /*
146*2797Sjg  * Global list of files registered and updated by the nvpflush
147*2797Sjg  * daemon, protected by the nvf_cache_mutex.  While an
148*2797Sjg  * update is taking place, a file is temporarily moved to
149*2797Sjg  * the dirty list to avoid locking the primary list for
150*2797Sjg  * the duration of the update.
151*2797Sjg  */
152*2797Sjg list_t		nvf_cache_files;
153*2797Sjg list_t		nvf_dirty_files;
154*2797Sjg kmutex_t	nvf_cache_mutex;
155*2797Sjg 
156*2797Sjg 
157*2797Sjg /*
158*2797Sjg  * Allow some delay from an update of the data before flushing
159*2797Sjg  * to permit simultaneous updates of multiple changes.
160*2797Sjg  * Changes in the data are expected to be bursty, ie
161*2797Sjg  * reconfig or hot-plug of a new adapter.
162*2797Sjg  *
163*2797Sjg  * kfio_report_error (default 0)
164*2797Sjg  *	Set to 1 to enable some error messages related to low-level
165*2797Sjg  *	kernel file i/o operations.
166*2797Sjg  *
167*2797Sjg  * nvpflush_delay (default 10)
168*2797Sjg  *	The number of seconds after data is marked dirty before the
169*2797Sjg  *	flush daemon is triggered to flush the data.  A longer period
170*2797Sjg  *	of time permits more data updates per write.  Note that
171*2797Sjg  *	every update resets the timer so no repository write will
172*2797Sjg  *	occur while data is being updated continuously.
173*2797Sjg  *
174*2797Sjg  * nvpdaemon_idle_time (default 60)
175*2797Sjg  *	The number of seconds the daemon will sleep idle before exiting.
176*2797Sjg  *
177*2797Sjg  */
178*2797Sjg #define	NVPFLUSH_DELAY		10
179*2797Sjg #define	NVPDAEMON_IDLE_TIME	60
180*2797Sjg 
181*2797Sjg #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
182*2797Sjg 
183*2797Sjg /*
184*2797Sjg  * Tunables
185*2797Sjg  */
186*2797Sjg int kfio_report_error = 0;		/* kernel file i/o operations */
187*2797Sjg int kfio_disable_read = 0;		/* disable all reads */
188*2797Sjg int kfio_disable_write = 0;		/* disable all writes */
189*2797Sjg 
190*2797Sjg int nvpflush_delay	= NVPFLUSH_DELAY;
191*2797Sjg int nvpdaemon_idle_time	= NVPDAEMON_IDLE_TIME;
192*2797Sjg 
193*2797Sjg static timeout_id_t	nvpflush_id = 0;
194*2797Sjg static int		nvpflush_timer_busy = 0;
195*2797Sjg static int		nvpflush_daemon_active = 0;
196*2797Sjg static kthread_t	*nvpflush_thr_id = 0;
197*2797Sjg 
198*2797Sjg static int		do_nvpflush = 0;
199*2797Sjg static int		nvpbusy = 0;
200*2797Sjg static kmutex_t		nvpflush_lock;
201*2797Sjg static kcondvar_t	nvpflush_cv;
202*2797Sjg static kthread_id_t	nvpflush_thread;
203*2797Sjg static clock_t		nvpticks;
204*2797Sjg 
205*2797Sjg static void nvpflush_daemon(void);
206*2797Sjg 
207*2797Sjg #ifdef	DEBUG
208*2797Sjg int nvpdaemon_debug = 0;
209*2797Sjg int kfio_debug = 0;
210*2797Sjg #endif	/* DEBUG */
211*2797Sjg 
212*2797Sjg extern int modrootloaded;
213*2797Sjg extern void mdi_read_devices_files(void);
214*2797Sjg extern void mdi_clean_vhcache(void);
215*2797Sjg 
216*2797Sjg /*
217*2797Sjg  * Initialize the overall cache file management
218*2797Sjg  */
219*2797Sjg void
220*2797Sjg i_ddi_devices_init(void)
221*2797Sjg {
222*2797Sjg 	list_create(&nvf_cache_files, sizeof (nvfd_t),
223*2797Sjg 	    offsetof(nvfd_t, nvf_link));
224*2797Sjg 	list_create(&nvf_dirty_files, sizeof (nvfd_t),
225*2797Sjg 	    offsetof(nvfd_t, nvf_link));
226*2797Sjg 	mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL);
227*2797Sjg 	devid_cache_init();
228*2797Sjg }
229*2797Sjg 
230*2797Sjg /*
231*2797Sjg  * Read cache files
232*2797Sjg  * The files read here should be restricted to those
233*2797Sjg  * that may be required to mount root.
234*2797Sjg  */
235*2797Sjg void
236*2797Sjg i_ddi_read_devices_files(void)
237*2797Sjg {
238*2797Sjg 	if (!kfio_disable_read) {
239*2797Sjg 		mdi_read_devices_files();
240*2797Sjg 		devid_cache_read();
241*2797Sjg 	}
242*2797Sjg }
243*2797Sjg 
244*2797Sjg void
245*2797Sjg i_ddi_start_flush_daemon(void)
246*2797Sjg {
247*2797Sjg 	nvfd_t	*nvfdp;
248*2797Sjg 
249*2797Sjg 	ASSERT(i_ddi_io_initialized());
250*2797Sjg 
251*2797Sjg 	mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL);
252*2797Sjg 	cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL);
253*2797Sjg 
254*2797Sjg 	mutex_enter(&nvf_cache_mutex);
255*2797Sjg 	for (nvfdp = list_head(&nvf_cache_files); nvfdp;
256*2797Sjg 	    nvfdp = list_next(&nvf_cache_files, nvfdp)) {
257*2797Sjg 		if (NVF_IS_DIRTY(nvfdp)) {
258*2797Sjg 			nvf_wake_daemon();
259*2797Sjg 			break;
260*2797Sjg 		}
261*2797Sjg 	}
262*2797Sjg 	mutex_exit(&nvf_cache_mutex);
263*2797Sjg }
264*2797Sjg 
/*
 * Run the cleanup handlers of the cache clients known to this file:
 * first the devid cache, then the mdi vhci cache.
 */
void
i_ddi_clean_devices_files(void)
{
	devid_cache_cleanup();
	mdi_clean_vhcache();
}
271*2797Sjg 
272*2797Sjg /*
273*2797Sjg  * Register a cache file to be managed and updated by the nvpflush daemon.
274*2797Sjg  * All operations are performed through the returned handle.
275*2797Sjg  * There is no unregister mechanism for now.
276*2797Sjg  */
277*2797Sjg nvf_handle_t
278*2797Sjg nvf_register_file(nvf_ops_t *ops)
279*2797Sjg {
280*2797Sjg 	nvfd_t *nvfdp;
281*2797Sjg 
282*2797Sjg 	nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP);
283*2797Sjg 
284*2797Sjg 	nvfdp->nvf_ops = ops;
285*2797Sjg 	nvfdp->nvf_flags = 0;
286*2797Sjg 	rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL);
287*2797Sjg 
288*2797Sjg 	mutex_enter(&nvf_cache_mutex);
289*2797Sjg 	list_insert_tail(&nvf_cache_files, nvfdp);
290*2797Sjg 	mutex_exit(&nvf_cache_mutex);
291*2797Sjg 
292*2797Sjg 	return ((nvf_handle_t)nvfdp);
293*2797Sjg }
294*2797Sjg 
295*2797Sjg /*PRINTFLIKE1*/
296*2797Sjg void
297*2797Sjg nvf_error(const char *fmt, ...)
298*2797Sjg {
299*2797Sjg 	va_list ap;
300*2797Sjg 
301*2797Sjg 	if (kfio_report_error) {
302*2797Sjg 		va_start(ap, fmt);
303*2797Sjg 		vcmn_err(CE_NOTE, fmt, ap);
304*2797Sjg 		va_end(ap);
305*2797Sjg 	}
306*2797Sjg }
307*2797Sjg 
308*2797Sjg /*
309*2797Sjg  * Some operations clients may use to manage the data
310*2797Sjg  * to be persisted in a cache file.
311*2797Sjg  */
312*2797Sjg char *
313*2797Sjg nvf_cache_name(nvf_handle_t handle)
314*2797Sjg {
315*2797Sjg 	return (((nvfd_t *)handle)->nvf_cache_path);
316*2797Sjg }
317*2797Sjg 
318*2797Sjg krwlock_t *
319*2797Sjg nvf_lock(nvf_handle_t handle)
320*2797Sjg {
321*2797Sjg 	return (&(((nvfd_t *)handle)->nvf_lock));
322*2797Sjg }
323*2797Sjg 
324*2797Sjg list_t *
325*2797Sjg nvf_list(nvf_handle_t handle)
326*2797Sjg {
327*2797Sjg 	return (&(((nvfd_t *)handle)->nvf_data_list));
328*2797Sjg }
329*2797Sjg 
330*2797Sjg void
331*2797Sjg nvf_mark_dirty(nvf_handle_t handle)
332*2797Sjg {
333*2797Sjg 	ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock)));
334*2797Sjg 	NVF_MARK_DIRTY((nvfd_t *)handle);
335*2797Sjg }
336*2797Sjg 
337*2797Sjg int
338*2797Sjg nvf_is_dirty(nvf_handle_t handle)
339*2797Sjg {
340*2797Sjg 	ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock)));
341*2797Sjg 	return (NVF_IS_DIRTY((nvfd_t *)handle));
342*2797Sjg }
343*2797Sjg 
/*
 * Compute a 16-bit checksum over buflen bytes at buf by XOR-ing
 * the buffer together as native-order 16-bit words.  When buflen
 * is odd, the final byte is folded in as the initial value.
 * buf is accessed through a uint16_t pointer.
 */
static uint16_t
nvp_cksum(uchar_t *buf, int64_t buflen)
{
	uint16_t	*words = (uint16_t *)buf;
	int64_t		nwords = buflen / 2;
	uint16_t	sum;
	int64_t		i;

	/* an odd trailing byte seeds the sum */
	sum = ((buflen & 0x01) != 0) ? buf[buflen - 1] : 0;

	for (i = 0; i < nwords; i++)
		sum ^= words[i];

	return (sum);
}
360*2797Sjg 
361*2797Sjg int
362*2797Sjg fread_nvlist(char *filename, nvlist_t **ret_nvlist)
363*2797Sjg {
364*2797Sjg 	struct _buf	*file;
365*2797Sjg 	nvpf_hdr_t	hdr;
366*2797Sjg 	char		*buf;
367*2797Sjg 	nvlist_t	*nvl;
368*2797Sjg 	int		rval;
369*2797Sjg 	uint_t		offset;
370*2797Sjg 	int		n;
371*2797Sjg 	char		c;
372*2797Sjg 	uint16_t	cksum, hdrsum;
373*2797Sjg 
374*2797Sjg 	*ret_nvlist = NULL;
375*2797Sjg 
376*2797Sjg 	file = kobj_open_file(filename);
377*2797Sjg 	if (file == (struct _buf *)-1) {
378*2797Sjg 		KFDEBUG((CE_CONT, "cannot open file: %s\n", filename));
379*2797Sjg 		return (ENOENT);
380*2797Sjg 	}
381*2797Sjg 
382*2797Sjg 	offset = 0;
383*2797Sjg 	n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset);
384*2797Sjg 	if (n != sizeof (hdr)) {
385*2797Sjg 		kobj_close_file(file);
386*2797Sjg 		if (n < 0) {
387*2797Sjg 			nvf_error("error reading header: %s\n", filename);
388*2797Sjg 			return (EIO);
389*2797Sjg 		} else if (n == 0) {
390*2797Sjg 			KFDEBUG((CE_CONT, "file empty: %s\n", filename));
391*2797Sjg 		} else {
392*2797Sjg 			nvf_error("header size incorrect: %s\n", filename);
393*2797Sjg 		}
394*2797Sjg 		return (EINVAL);
395*2797Sjg 	}
396*2797Sjg 	offset += n;
397*2797Sjg 
398*2797Sjg 	KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic));
399*2797Sjg 	KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version));
400*2797Sjg 	KFDEBUG2((CE_CONT, "nvpf_size: %lld\n",
401*2797Sjg 		(longlong_t)hdr.nvpf_size));
402*2797Sjg 	KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n",
403*2797Sjg 		hdr.nvpf_hdr_chksum));
404*2797Sjg 	KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum));
405*2797Sjg 
406*2797Sjg 	cksum = hdr.nvpf_hdr_chksum;
407*2797Sjg 	hdr.nvpf_hdr_chksum = 0;
408*2797Sjg 	hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr));
409*2797Sjg 
410*2797Sjg 	if (hdr.nvpf_magic != NVPF_HDR_MAGIC ||
411*2797Sjg 	    hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) {
412*2797Sjg 		kobj_close_file(file);
413*2797Sjg 		if (hdrsum != cksum) {
414*2797Sjg 			nvf_error("%s: checksum error "
415*2797Sjg 			    "(actual 0x%x, expected 0x%x)\n",
416*2797Sjg 			    filename, hdrsum, cksum);
417*2797Sjg 		}
418*2797Sjg 		nvf_error("%s: header information incorrect", filename);
419*2797Sjg 		return (EINVAL);
420*2797Sjg 	}
421*2797Sjg 
422*2797Sjg 	ASSERT(hdr.nvpf_size >= 0);
423*2797Sjg 
424*2797Sjg 	buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP);
425*2797Sjg 	n = kobj_read_file(file, buf, hdr.nvpf_size, offset);
426*2797Sjg 	if (n != hdr.nvpf_size) {
427*2797Sjg 		kmem_free(buf, hdr.nvpf_size);
428*2797Sjg 		kobj_close_file(file);
429*2797Sjg 		if (n < 0) {
430*2797Sjg 			nvf_error("%s: read error %d", filename, n);
431*2797Sjg 		} else {
432*2797Sjg 			nvf_error("%s: incomplete read %d/%lld",
433*2797Sjg 				filename, n, (longlong_t)hdr.nvpf_size);
434*2797Sjg 		}
435*2797Sjg 		return (EINVAL);
436*2797Sjg 	}
437*2797Sjg 	offset += n;
438*2797Sjg 
439*2797Sjg 	rval = kobj_read_file(file, &c, 1, offset);
440*2797Sjg 	kobj_close_file(file);
441*2797Sjg 	if (rval > 0) {
442*2797Sjg 		nvf_error("%s is larger than %lld\n",
443*2797Sjg 			filename, (longlong_t)hdr.nvpf_size);
444*2797Sjg 		kmem_free(buf, hdr.nvpf_size);
445*2797Sjg 		return (EINVAL);
446*2797Sjg 	}
447*2797Sjg 
448*2797Sjg 	cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size);
449*2797Sjg 	if (hdr.nvpf_chksum != cksum) {
450*2797Sjg 		nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n",
451*2797Sjg 		    filename, hdr.nvpf_chksum, cksum);
452*2797Sjg 		kmem_free(buf, hdr.nvpf_size);
453*2797Sjg 		return (EINVAL);
454*2797Sjg 	}
455*2797Sjg 
456*2797Sjg 	nvl = NULL;
457*2797Sjg 	rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0);
458*2797Sjg 	if (rval != 0) {
459*2797Sjg 		nvf_error("%s: error %d unpacking nvlist\n",
460*2797Sjg 			filename, rval);
461*2797Sjg 		kmem_free(buf, hdr.nvpf_size);
462*2797Sjg 		return (EINVAL);
463*2797Sjg 	}
464*2797Sjg 
465*2797Sjg 	kmem_free(buf, hdr.nvpf_size);
466*2797Sjg 	*ret_nvlist = nvl;
467*2797Sjg 	return (0);
468*2797Sjg }
469*2797Sjg 
470*2797Sjg static int
471*2797Sjg kfcreate(char *filename, kfile_t **kfilep)
472*2797Sjg {
473*2797Sjg 	kfile_t	*fp;
474*2797Sjg 	int	rval;
475*2797Sjg 
476*2797Sjg 	ASSERT(modrootloaded);
477*2797Sjg 
478*2797Sjg 	fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP);
479*2797Sjg 
480*2797Sjg 	fp->kf_vnflags = FCREAT | FWRITE | FTRUNC;
481*2797Sjg 	fp->kf_fname = filename;
482*2797Sjg 	fp->kf_fpos = 0;
483*2797Sjg 	fp->kf_state = 0;
484*2797Sjg 
485*2797Sjg 	KFDEBUG((CE_CONT, "create: %s flags 0x%x\n",
486*2797Sjg 		filename, fp->kf_vnflags));
487*2797Sjg 	rval = vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags,
488*2797Sjg 	    0444, &fp->kf_vp, CRCREAT, 0);
489*2797Sjg 	if (rval != 0) {
490*2797Sjg 		kmem_free(fp, sizeof (kfile_t));
491*2797Sjg 		KFDEBUG((CE_CONT, "%s: create error %d\n",
492*2797Sjg 			filename, rval));
493*2797Sjg 		return (rval);
494*2797Sjg 	}
495*2797Sjg 
496*2797Sjg 	*kfilep = fp;
497*2797Sjg 	return (0);
498*2797Sjg }
499*2797Sjg 
500*2797Sjg static int
501*2797Sjg kfremove(char *filename)
502*2797Sjg {
503*2797Sjg 	int rval;
504*2797Sjg 
505*2797Sjg 	KFDEBUG((CE_CONT, "remove: %s\n", filename));
506*2797Sjg 	rval = vn_remove(filename, UIO_SYSSPACE, RMFILE);
507*2797Sjg 	if (rval != 0) {
508*2797Sjg 		KFDEBUG((CE_CONT, "%s: remove error %d\n",
509*2797Sjg 			filename, rval));
510*2797Sjg 	}
511*2797Sjg 	return (rval);
512*2797Sjg }
513*2797Sjg 
514*2797Sjg static int
515*2797Sjg kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
516*2797Sjg {
517*2797Sjg 	ssize_t		resid;
518*2797Sjg 	int		err;
519*2797Sjg 	ssize_t		n;
520*2797Sjg 
521*2797Sjg 	ASSERT(modrootloaded);
522*2797Sjg 
523*2797Sjg 	if (fp->kf_state != 0)
524*2797Sjg 		return (fp->kf_state);
525*2797Sjg 
526*2797Sjg 	err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos,
527*2797Sjg 		UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid);
528*2797Sjg 	if (err != 0) {
529*2797Sjg 		KFDEBUG((CE_CONT, "%s: read error %d\n",
530*2797Sjg 			fp->kf_fname, err));
531*2797Sjg 		fp->kf_state = err;
532*2797Sjg 		return (err);
533*2797Sjg 	}
534*2797Sjg 
535*2797Sjg 	ASSERT(resid >= 0 && resid <= bufsiz);
536*2797Sjg 	n = bufsiz - resid;
537*2797Sjg 
538*2797Sjg 	KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n",
539*2797Sjg 		fp->kf_fname, n, bufsiz, resid));
540*2797Sjg 
541*2797Sjg 	fp->kf_fpos += n;
542*2797Sjg 	*ret_n = n;
543*2797Sjg 	return (0);
544*2797Sjg }
545*2797Sjg 
546*2797Sjg static int
547*2797Sjg kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
548*2797Sjg {
549*2797Sjg 	rlim64_t	rlimit;
550*2797Sjg 	ssize_t		resid;
551*2797Sjg 	int		err;
552*2797Sjg 	ssize_t		len;
553*2797Sjg 	ssize_t		n = 0;
554*2797Sjg 
555*2797Sjg 	ASSERT(modrootloaded);
556*2797Sjg 
557*2797Sjg 	if (fp->kf_state != 0)
558*2797Sjg 		return (fp->kf_state);
559*2797Sjg 
560*2797Sjg 	len = bufsiz;
561*2797Sjg 	rlimit = bufsiz + 1;
562*2797Sjg 	for (;;) {
563*2797Sjg 		err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos,
564*2797Sjg 			UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
565*2797Sjg 		if (err) {
566*2797Sjg 			KFDEBUG((CE_CONT, "%s: write error %d\n",
567*2797Sjg 				fp->kf_fname, err));
568*2797Sjg 			fp->kf_state = err;
569*2797Sjg 			return (err);
570*2797Sjg 		}
571*2797Sjg 
572*2797Sjg 		KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n",
573*2797Sjg 			fp->kf_fname, len-resid, resid));
574*2797Sjg 
575*2797Sjg 		ASSERT(resid >= 0 && resid <= len);
576*2797Sjg 
577*2797Sjg 		n += (len - resid);
578*2797Sjg 		if (resid == 0)
579*2797Sjg 			break;
580*2797Sjg 
581*2797Sjg 		if (resid == len) {
582*2797Sjg 			KFDEBUG((CE_CONT, "%s: filesystem full?\n",
583*2797Sjg 				fp->kf_fname));
584*2797Sjg 			fp->kf_state = ENOSPC;
585*2797Sjg 			return (ENOSPC);
586*2797Sjg 		}
587*2797Sjg 
588*2797Sjg 		len -= resid;
589*2797Sjg 		buf += len;
590*2797Sjg 		fp->kf_fpos += len;
591*2797Sjg 		len = resid;
592*2797Sjg 	}
593*2797Sjg 
594*2797Sjg 	ASSERT(n == bufsiz);
595*2797Sjg 	KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n));
596*2797Sjg 
597*2797Sjg 	*ret_n = n;
598*2797Sjg 	return (0);
599*2797Sjg }
600*2797Sjg 
601*2797Sjg 
602*2797Sjg static int
603*2797Sjg kfclose(kfile_t *fp)
604*2797Sjg {
605*2797Sjg 	int		rval;
606*2797Sjg 
607*2797Sjg 	KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname));
608*2797Sjg 
609*2797Sjg 	if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) {
610*2797Sjg 		rval = VOP_FSYNC(fp->kf_vp, FSYNC,  kcred);
611*2797Sjg 		if (rval != 0) {
612*2797Sjg 			nvf_error("%s: sync error %d\n",
613*2797Sjg 				fp->kf_fname, rval);
614*2797Sjg 		}
615*2797Sjg 		KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname));
616*2797Sjg 	}
617*2797Sjg 
618*2797Sjg 	rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1, (offset_t)0, kcred);
619*2797Sjg 	if (rval != 0) {
620*2797Sjg 		if (fp->kf_state == 0) {
621*2797Sjg 			nvf_error("%s: close error %d\n",
622*2797Sjg 				fp->kf_fname, rval);
623*2797Sjg 		}
624*2797Sjg 	} else {
625*2797Sjg 		if (fp->kf_state == 0)
626*2797Sjg 			KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname));
627*2797Sjg 	}
628*2797Sjg 
629*2797Sjg 	VN_RELE(fp->kf_vp);
630*2797Sjg 	kmem_free(fp, sizeof (kfile_t));
631*2797Sjg 	return (rval);
632*2797Sjg }
633*2797Sjg 
634*2797Sjg static int
635*2797Sjg kfrename(char *oldname, char *newname)
636*2797Sjg {
637*2797Sjg 	int rval;
638*2797Sjg 
639*2797Sjg 	ASSERT(modrootloaded);
640*2797Sjg 
641*2797Sjg 	KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname));
642*2797Sjg 
643*2797Sjg 	if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) {
644*2797Sjg 		KFDEBUG((CE_CONT, "rename %s to %s: %d\n",
645*2797Sjg 			oldname, newname, rval));
646*2797Sjg 	}
647*2797Sjg 
648*2797Sjg 	return (rval);
649*2797Sjg }
650*2797Sjg 
651*2797Sjg int
652*2797Sjg fwrite_nvlist(char *filename, nvlist_t *nvl)
653*2797Sjg {
654*2797Sjg 	char	*buf;
655*2797Sjg 	char	*nvbuf;
656*2797Sjg 	kfile_t	*fp;
657*2797Sjg 	char	*newname;
658*2797Sjg 	int	len, err, err1;
659*2797Sjg 	size_t	buflen;
660*2797Sjg 	ssize_t	n;
661*2797Sjg 
662*2797Sjg 	ASSERT(modrootloaded);
663*2797Sjg 
664*2797Sjg 	nvbuf = NULL;
665*2797Sjg 	err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0);
666*2797Sjg 	if (err != 0) {
667*2797Sjg 		nvf_error("%s: error %d packing nvlist\n",
668*2797Sjg 			filename, err);
669*2797Sjg 		return (err);
670*2797Sjg 	}
671*2797Sjg 
672*2797Sjg 	buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP);
673*2797Sjg 	bzero(buf, sizeof (nvpf_hdr_t));
674*2797Sjg 
675*2797Sjg 	((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC;
676*2797Sjg 	((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION;
677*2797Sjg 	((nvpf_hdr_t *)buf)->nvpf_size = buflen;
678*2797Sjg 	((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen);
679*2797Sjg 	((nvpf_hdr_t *)buf)->nvpf_hdr_chksum =
680*2797Sjg 		nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t));
681*2797Sjg 
682*2797Sjg 	bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen);
683*2797Sjg 	kmem_free(nvbuf, buflen);
684*2797Sjg 	buflen += sizeof (nvpf_hdr_t);
685*2797Sjg 
686*2797Sjg 	len = strlen(filename) + MAX_SUFFIX_LEN + 2;
687*2797Sjg 	newname = kmem_alloc(len, KM_SLEEP);
688*2797Sjg 
689*2797Sjg 
690*2797Sjg 	(void) sprintf(newname, "%s.%s",
691*2797Sjg 		filename, NEW_FILENAME_SUFFIX);
692*2797Sjg 
693*2797Sjg 	/*
694*2797Sjg 	 * To make it unlikely we suffer data loss, write
695*2797Sjg 	 * data to the new temporary file.  Once successful
696*2797Sjg 	 * complete the transaction by renaming the new file
697*2797Sjg 	 * to replace the previous.
698*2797Sjg 	 */
699*2797Sjg 
700*2797Sjg 	if ((err = kfcreate(newname, &fp)) == 0) {
701*2797Sjg 		err = kfwrite(fp, buf, buflen, &n);
702*2797Sjg 		if (err) {
703*2797Sjg 			nvf_error("%s: write error - %d\n",
704*2797Sjg 				newname, err);
705*2797Sjg 		} else {
706*2797Sjg 			if (n != buflen) {
707*2797Sjg 				nvf_error(
708*2797Sjg 				    "%s: partial write %ld of %ld bytes\n",
709*2797Sjg 				    newname, n, buflen);
710*2797Sjg 				nvf_error("%s: filesystem may be full?\n",
711*2797Sjg 				    newname);
712*2797Sjg 				err = EIO;
713*2797Sjg 			}
714*2797Sjg 		}
715*2797Sjg 		if ((err1 = kfclose(fp)) != 0) {
716*2797Sjg 			nvf_error("%s: close error\n", newname);
717*2797Sjg 			if (err == 0)
718*2797Sjg 				err = err1;
719*2797Sjg 		}
720*2797Sjg 		if (err != 0) {
721*2797Sjg 			if (kfremove(newname) != 0) {
722*2797Sjg 				nvf_error("%s: remove failed\n",
723*2797Sjg 				    newname);
724*2797Sjg 			}
725*2797Sjg 		}
726*2797Sjg 	} else {
727*2797Sjg 		nvf_error("%s: create failed - %d\n", filename, err);
728*2797Sjg 	}
729*2797Sjg 
730*2797Sjg 	if (err == 0) {
731*2797Sjg 		if ((err = kfrename(newname, filename)) != 0) {
732*2797Sjg 			nvf_error("%s: rename from %s failed\n",
733*2797Sjg 				newname, filename);
734*2797Sjg 		}
735*2797Sjg 	}
736*2797Sjg 
737*2797Sjg 	kmem_free(newname, len);
738*2797Sjg 	kmem_free(buf, buflen);
739*2797Sjg 
740*2797Sjg 	return (err);
741*2797Sjg }
742*2797Sjg 
743*2797Sjg static int
744*2797Sjg e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl)
745*2797Sjg {
746*2797Sjg 	int err;
747*2797Sjg 
748*2797Sjg 	if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0)
749*2797Sjg 		return (DDI_SUCCESS);
750*2797Sjg 	else {
751*2797Sjg 		if (err == EROFS)
752*2797Sjg 			NVF_MARK_READONLY(nvfd);
753*2797Sjg 		return (DDI_FAILURE);
754*2797Sjg 	}
755*2797Sjg }
756*2797Sjg 
757*2797Sjg static void
758*2797Sjg nvp_list_free(nvfd_t *nvf)
759*2797Sjg {
760*2797Sjg 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
761*2797Sjg 	(nvf->nvf_list_free)((nvf_handle_t)nvf);
762*2797Sjg 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
763*2797Sjg }
764*2797Sjg 
765*2797Sjg /*
766*2797Sjg  * Read a file in the nvlist format
767*2797Sjg  *	EIO - i/o error during read
768*2797Sjg  *	ENOENT - file not found
769*2797Sjg  *	EINVAL - file contents corrupted
770*2797Sjg  */
771*2797Sjg static int
772*2797Sjg fread_nvp_list(nvfd_t *nvfd)
773*2797Sjg {
774*2797Sjg 	nvlist_t	*nvl;
775*2797Sjg 	nvpair_t	*nvp;
776*2797Sjg 	char		*name;
777*2797Sjg 	nvlist_t	*sublist;
778*2797Sjg 	int		rval;
779*2797Sjg 	int		rv;
780*2797Sjg 
781*2797Sjg 	ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
782*2797Sjg 
783*2797Sjg 	rval = fread_nvlist(nvfd->nvf_cache_path, &nvl);
784*2797Sjg 	if (rval != 0)
785*2797Sjg 		return (rval);
786*2797Sjg 	ASSERT(nvl != NULL);
787*2797Sjg 
788*2797Sjg 	nvp = NULL;
789*2797Sjg 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
790*2797Sjg 		name = nvpair_name(nvp);
791*2797Sjg 		ASSERT(strlen(name) > 0);
792*2797Sjg 
793*2797Sjg 		switch (nvpair_type(nvp)) {
794*2797Sjg 		case DATA_TYPE_NVLIST:
795*2797Sjg 			rval = nvpair_value_nvlist(nvp, &sublist);
796*2797Sjg 			if (rval != 0) {
797*2797Sjg 				nvf_error(
798*2797Sjg 				    "nvpair_value_nvlist error %s %d\n",
799*2797Sjg 				    name, rval);
800*2797Sjg 				goto error;
801*2797Sjg 			}
802*2797Sjg 
803*2797Sjg 			/*
804*2797Sjg 			 * unpack nvlist for this device and
805*2797Sjg 			 * add elements to data list.
806*2797Sjg 			 */
807*2797Sjg 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
808*2797Sjg 			rv = (nvfd->nvf_unpack_nvlist)
809*2797Sjg 			    ((nvf_handle_t)nvfd, sublist, name);
810*2797Sjg 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
811*2797Sjg 			if (rv != 0) {
812*2797Sjg 				nvf_error(
813*2797Sjg 				    "%s: %s invalid list element\n",
814*2797Sjg 				    nvfd->nvf_cache_path, name);
815*2797Sjg 				rval = EINVAL;
816*2797Sjg 				goto error;
817*2797Sjg 			}
818*2797Sjg 			break;
819*2797Sjg 
820*2797Sjg 		default:
821*2797Sjg 			nvf_error("%s: %s unsupported data type %d\n",
822*2797Sjg 				nvfd->nvf_cache_path, name, nvpair_type(nvp));
823*2797Sjg 			rval = EINVAL;
824*2797Sjg 			goto error;
825*2797Sjg 		}
826*2797Sjg 	}
827*2797Sjg 
828*2797Sjg 	nvlist_free(nvl);
829*2797Sjg 
830*2797Sjg 	return (0);
831*2797Sjg 
832*2797Sjg error:
833*2797Sjg 	nvlist_free(nvl);
834*2797Sjg 	nvp_list_free(nvfd);
835*2797Sjg 	return (rval);
836*2797Sjg }
837*2797Sjg 
838*2797Sjg 
839*2797Sjg int
840*2797Sjg nvf_read_file(nvf_handle_t nvf_handle)
841*2797Sjg {
842*2797Sjg 	nvfd_t *nvfd = (nvfd_t *)nvf_handle;
843*2797Sjg 	int rval;
844*2797Sjg 
845*2797Sjg 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
846*2797Sjg 
847*2797Sjg 	if (kfio_disable_read)
848*2797Sjg 		return (0);
849*2797Sjg 
850*2797Sjg 	KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path));
851*2797Sjg 
852*2797Sjg 	rval = fread_nvp_list(nvfd);
853*2797Sjg 	if (rval) {
854*2797Sjg 		switch (rval) {
855*2797Sjg 		case EIO:
856*2797Sjg 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
857*2797Sjg 			cmn_err(CE_WARN, "%s: I/O error",
858*2797Sjg 				nvfd->nvf_cache_path);
859*2797Sjg 			break;
860*2797Sjg 		case ENOENT:
861*2797Sjg 			nvfd->nvf_flags |= NVF_F_CREATE_MSG;
862*2797Sjg 			nvf_error("%s: not found\n",
863*2797Sjg 				nvfd->nvf_cache_path);
864*2797Sjg 			break;
865*2797Sjg 		case EINVAL:
866*2797Sjg 		default:
867*2797Sjg 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
868*2797Sjg 			cmn_err(CE_WARN, "%s: data file corrupted",
869*2797Sjg 				nvfd->nvf_cache_path);
870*2797Sjg 			break;
871*2797Sjg 		}
872*2797Sjg 	}
873*2797Sjg 	return (rval);
874*2797Sjg }
875*2797Sjg 
876*2797Sjg static void
877*2797Sjg nvf_write_is_complete(nvfd_t *fd)
878*2797Sjg {
879*2797Sjg 	if (fd->nvf_write_complete) {
880*2797Sjg 		(fd->nvf_write_complete)((nvf_handle_t)fd);
881*2797Sjg 	}
882*2797Sjg }
883*2797Sjg 
884*2797Sjg /*ARGSUSED*/
885*2797Sjg static void
886*2797Sjg nvpflush_timeout(void *arg)
887*2797Sjg {
888*2797Sjg 	clock_t nticks;
889*2797Sjg 
890*2797Sjg 	mutex_enter(&nvpflush_lock);
891*2797Sjg 	nticks = nvpticks - ddi_get_lbolt();
892*2797Sjg 	if (nticks > 4) {
893*2797Sjg 		nvpflush_timer_busy = 1;
894*2797Sjg 		mutex_exit(&nvpflush_lock);
895*2797Sjg 		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks);
896*2797Sjg 	} else {
897*2797Sjg 		do_nvpflush = 1;
898*2797Sjg 		NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n"));
899*2797Sjg 		cv_signal(&nvpflush_cv);
900*2797Sjg 		nvpflush_id = 0;
901*2797Sjg 		nvpflush_timer_busy = 0;
902*2797Sjg 		mutex_exit(&nvpflush_lock);
903*2797Sjg 	}
904*2797Sjg }
905*2797Sjg 
906*2797Sjg /*
907*2797Sjg  * After marking a list as dirty, wake the nvpflush daemon
908*2797Sjg  * to perform the update.
909*2797Sjg  */
910*2797Sjg void
911*2797Sjg nvf_wake_daemon(void)
912*2797Sjg {
913*2797Sjg 	clock_t nticks;
914*2797Sjg 
915*2797Sjg 	/*
916*2797Sjg 	 * If the system isn't up yet
917*2797Sjg 	 * don't even think about starting a flush.
918*2797Sjg 	 */
919*2797Sjg 	if (!i_ddi_io_initialized())
920*2797Sjg 		return;
921*2797Sjg 
922*2797Sjg 	mutex_enter(&nvpflush_lock);
923*2797Sjg 
924*2797Sjg 	if (nvpflush_daemon_active == 0) {
925*2797Sjg 		nvpflush_daemon_active = 1;
926*2797Sjg 		mutex_exit(&nvpflush_lock);
927*2797Sjg 		NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n"));
928*2797Sjg 		nvpflush_thr_id = thread_create(NULL, 0,
929*2797Sjg 		    (void (*)())nvpflush_daemon,
930*2797Sjg 		    NULL, 0, &p0, TS_RUN, minclsyspri);
931*2797Sjg 		mutex_enter(&nvpflush_lock);
932*2797Sjg 	}
933*2797Sjg 
934*2797Sjg 	nticks = nvpflush_delay * TICKS_PER_SECOND;
935*2797Sjg 	nvpticks = ddi_get_lbolt() + nticks;
936*2797Sjg 	if (nvpflush_timer_busy == 0) {
937*2797Sjg 		nvpflush_timer_busy = 1;
938*2797Sjg 		mutex_exit(&nvpflush_lock);
939*2797Sjg 		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4);
940*2797Sjg 	} else
941*2797Sjg 		mutex_exit(&nvpflush_lock);
942*2797Sjg }
943*2797Sjg 
944*2797Sjg static int
945*2797Sjg nvpflush_one(nvfd_t *nvfd)
946*2797Sjg {
947*2797Sjg 	int rval = DDI_SUCCESS;
948*2797Sjg 	nvlist_t *nvl;
949*2797Sjg 
950*2797Sjg 	rw_enter(&nvfd->nvf_lock, RW_READER);
951*2797Sjg 
952*2797Sjg 	ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0);
953*2797Sjg 
954*2797Sjg 	if (!NVF_IS_DIRTY(nvfd) ||
955*2797Sjg 	    NVF_IS_READONLY(nvfd) || kfio_disable_write) {
956*2797Sjg 		NVF_CLEAR_DIRTY(nvfd);
957*2797Sjg 		rw_exit(&nvfd->nvf_lock);
958*2797Sjg 		return (DDI_SUCCESS);
959*2797Sjg 	}
960*2797Sjg 
961*2797Sjg 	if (rw_tryupgrade(&nvfd->nvf_lock) == 0) {
962*2797Sjg 		nvf_error("nvpflush: "
963*2797Sjg 		    "%s rw upgrade failed\n", nvfd->nvf_cache_path);
964*2797Sjg 		rw_exit(&nvfd->nvf_lock);
965*2797Sjg 		return (DDI_FAILURE);
966*2797Sjg 	}
967*2797Sjg 	if (((nvfd->nvf_pack_list)
968*2797Sjg 	    ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) {
969*2797Sjg 		nvf_error("nvpflush: "
970*2797Sjg 		    "%s nvlist construction failed\n", nvfd->nvf_cache_path);
971*2797Sjg 		ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
972*2797Sjg 		rw_exit(&nvfd->nvf_lock);
973*2797Sjg 		return (DDI_FAILURE);
974*2797Sjg 	}
975*2797Sjg 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
976*2797Sjg 
977*2797Sjg 	NVF_CLEAR_DIRTY(nvfd);
978*2797Sjg 	nvfd->nvf_flags |= NVF_F_FLUSHING;
979*2797Sjg 	rw_exit(&nvfd->nvf_lock);
980*2797Sjg 
981*2797Sjg 	rval = e_fwrite_nvlist(nvfd, nvl);
982*2797Sjg 	nvlist_free(nvl);
983*2797Sjg 
984*2797Sjg 	rw_enter(&nvfd->nvf_lock, RW_WRITER);
985*2797Sjg 	nvfd->nvf_flags &= ~NVF_F_FLUSHING;
986*2797Sjg 	if (rval == DDI_FAILURE) {
987*2797Sjg 		if (NVF_IS_READONLY(nvfd)) {
988*2797Sjg 			rval = DDI_SUCCESS;
989*2797Sjg 			nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY);
990*2797Sjg 		} else if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) {
991*2797Sjg 			cmn_err(CE_CONT,
992*2797Sjg 			    "%s: updated failed\n", nvfd->nvf_cache_path);
993*2797Sjg 			nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY;
994*2797Sjg 		}
995*2797Sjg 	} else {
996*2797Sjg 		if (nvfd->nvf_flags & NVF_F_CREATE_MSG) {
997*2797Sjg 			cmn_err(CE_CONT,
998*2797Sjg 			    "!Creating %s\n", nvfd->nvf_cache_path);
999*2797Sjg 			nvfd->nvf_flags &= ~NVF_F_CREATE_MSG;
1000*2797Sjg 		}
1001*2797Sjg 		if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) {
1002*2797Sjg 			cmn_err(CE_CONT,
1003*2797Sjg 			    "!Rebuilding %s\n", nvfd->nvf_cache_path);
1004*2797Sjg 			nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG;
1005*2797Sjg 		}
1006*2797Sjg 		if (nvfd->nvf_flags & NVF_F_ERROR) {
1007*2797Sjg 			cmn_err(CE_CONT,
1008*2797Sjg 			    "%s: update now ok\n", nvfd->nvf_cache_path);
1009*2797Sjg 			nvfd->nvf_flags &= ~NVF_F_ERROR;
1010*2797Sjg 		}
1011*2797Sjg 		/*
1012*2797Sjg 		 * The file may need to be flushed again if the cached
1013*2797Sjg 		 * data was touched while writing the earlier contents.
1014*2797Sjg 		 */
1015*2797Sjg 		if (NVF_IS_DIRTY(nvfd))
1016*2797Sjg 			rval = DDI_FAILURE;
1017*2797Sjg 	}
1018*2797Sjg 
1019*2797Sjg 	rw_exit(&nvfd->nvf_lock);
1020*2797Sjg 	return (rval);
1021*2797Sjg }
1022*2797Sjg 
1023*2797Sjg 
1024*2797Sjg static void
1025*2797Sjg nvpflush_daemon(void)
1026*2797Sjg {
1027*2797Sjg 	callb_cpr_t cprinfo;
1028*2797Sjg 	nvfd_t *nvfdp, *nextfdp;
1029*2797Sjg 	clock_t clk;
1030*2797Sjg 	int rval;
1031*2797Sjg 	int want_wakeup;
1032*2797Sjg 	int is_now_clean;
1033*2797Sjg 
1034*2797Sjg 	ASSERT(modrootloaded);
1035*2797Sjg 
1036*2797Sjg 	nvpflush_thread = curthread;
1037*2797Sjg 	NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n"));
1038*2797Sjg 
1039*2797Sjg 	CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp");
1040*2797Sjg 	mutex_enter(&nvpflush_lock);
1041*2797Sjg 	for (;;) {
1042*2797Sjg 
1043*2797Sjg 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1044*2797Sjg 		while (do_nvpflush == 0) {
1045*2797Sjg 			clk = cv_timedwait(&nvpflush_cv, &nvpflush_lock,
1046*2797Sjg 			    ddi_get_lbolt() +
1047*2797Sjg 				(nvpdaemon_idle_time * TICKS_PER_SECOND));
1048*2797Sjg 			if (clk == -1 &&
1049*2797Sjg 			    do_nvpflush == 0 && nvpflush_timer_busy == 0) {
1050*2797Sjg 				/*
1051*2797Sjg 				 * Note that CALLB_CPR_EXIT calls mutex_exit()
1052*2797Sjg 				 * on the lock passed in to CALLB_CPR_INIT,
1053*2797Sjg 				 * so the lock must be held when invoking it.
1054*2797Sjg 				 */
1055*2797Sjg 				CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
1056*2797Sjg 				NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n"));
1057*2797Sjg 				ASSERT(mutex_owned(&nvpflush_lock));
1058*2797Sjg 				nvpflush_thr_id = NULL;
1059*2797Sjg 				nvpflush_daemon_active = 0;
1060*2797Sjg 				CALLB_CPR_EXIT(&cprinfo);
1061*2797Sjg 				thread_exit();
1062*2797Sjg 			}
1063*2797Sjg 		}
1064*2797Sjg 		CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
1065*2797Sjg 
1066*2797Sjg 		nvpbusy = 1;
1067*2797Sjg 		want_wakeup = 0;
1068*2797Sjg 		do_nvpflush = 0;
1069*2797Sjg 		mutex_exit(&nvpflush_lock);
1070*2797Sjg 
1071*2797Sjg 		/*
1072*2797Sjg 		 * Try flushing what's dirty, reschedule if there's
1073*2797Sjg 		 * a failure or data gets marked as dirty again.
1074*2797Sjg 		 * First move each file marked dirty to the dirty
1075*2797Sjg 		 * list to avoid locking the list across the write.
1076*2797Sjg 		 */
1077*2797Sjg 		mutex_enter(&nvf_cache_mutex);
1078*2797Sjg 		for (nvfdp = list_head(&nvf_cache_files);
1079*2797Sjg 		    nvfdp; nvfdp = nextfdp) {
1080*2797Sjg 			nextfdp = list_next(&nvf_cache_files, nvfdp);
1081*2797Sjg 			rw_enter(&nvfdp->nvf_lock, RW_READER);
1082*2797Sjg 			if (NVF_IS_DIRTY(nvfdp)) {
1083*2797Sjg 				list_remove(&nvf_cache_files, nvfdp);
1084*2797Sjg 				list_insert_tail(&nvf_dirty_files, nvfdp);
1085*2797Sjg 				rw_exit(&nvfdp->nvf_lock);
1086*2797Sjg 			} else {
1087*2797Sjg 				NVPDAEMON_DEBUG((CE_CONT,
1088*2797Sjg 				    "nvpdaemon: not dirty %s\n",
1089*2797Sjg 				    nvfdp->nvf_cache_path));
1090*2797Sjg 				rw_exit(&nvfdp->nvf_lock);
1091*2797Sjg 			}
1092*2797Sjg 		}
1093*2797Sjg 		mutex_exit(&nvf_cache_mutex);
1094*2797Sjg 
1095*2797Sjg 		/*
1096*2797Sjg 		 * Now go through the dirty list
1097*2797Sjg 		 */
1098*2797Sjg 		for (nvfdp = list_head(&nvf_dirty_files);
1099*2797Sjg 		    nvfdp; nvfdp = nextfdp) {
1100*2797Sjg 			nextfdp = list_next(&nvf_dirty_files, nvfdp);
1101*2797Sjg 
1102*2797Sjg 			is_now_clean = 0;
1103*2797Sjg 			rw_enter(&nvfdp->nvf_lock, RW_READER);
1104*2797Sjg 			if (NVF_IS_DIRTY(nvfdp)) {
1105*2797Sjg 				NVPDAEMON_DEBUG((CE_CONT,
1106*2797Sjg 				    "nvpdaemon: flush %s\n",
1107*2797Sjg 				    nvfdp->nvf_cache_path));
1108*2797Sjg 				rw_exit(&nvfdp->nvf_lock);
1109*2797Sjg 				rval = nvpflush_one(nvfdp);
1110*2797Sjg 				rw_enter(&nvfdp->nvf_lock, RW_READER);
1111*2797Sjg 				if (rval != DDI_SUCCESS ||
1112*2797Sjg 				    NVF_IS_DIRTY(nvfdp)) {
1113*2797Sjg 					rw_exit(&nvfdp->nvf_lock);
1114*2797Sjg 					NVPDAEMON_DEBUG((CE_CONT,
1115*2797Sjg 					    "nvpdaemon: %s dirty again\n",
1116*2797Sjg 					    nvfdp->nvf_cache_path));
1117*2797Sjg 					want_wakeup = 1;
1118*2797Sjg 				} else {
1119*2797Sjg 					rw_exit(&nvfdp->nvf_lock);
1120*2797Sjg 					nvf_write_is_complete(nvfdp);
1121*2797Sjg 					is_now_clean = 1;
1122*2797Sjg 				}
1123*2797Sjg 			} else {
1124*2797Sjg 				NVPDAEMON_DEBUG((CE_CONT,
1125*2797Sjg 				    "nvpdaemon: not dirty %s\n",
1126*2797Sjg 				    nvfdp->nvf_cache_path));
1127*2797Sjg 				rw_exit(&nvfdp->nvf_lock);
1128*2797Sjg 				is_now_clean = 1;
1129*2797Sjg 			}
1130*2797Sjg 
1131*2797Sjg 			if (is_now_clean) {
1132*2797Sjg 				mutex_enter(&nvf_cache_mutex);
1133*2797Sjg 				list_remove(&nvf_dirty_files, nvfdp);
1134*2797Sjg 				list_insert_tail(&nvf_cache_files,
1135*2797Sjg 				    nvfdp);
1136*2797Sjg 				mutex_exit(&nvf_cache_mutex);
1137*2797Sjg 			}
1138*2797Sjg 		}
1139*2797Sjg 
1140*2797Sjg 		if (want_wakeup)
1141*2797Sjg 			nvf_wake_daemon();
1142*2797Sjg 
1143*2797Sjg 		mutex_enter(&nvpflush_lock);
1144*2797Sjg 		nvpbusy = 0;
1145*2797Sjg 	}
1146*2797Sjg }
1147