xref: /onnv-gate/usr/src/uts/common/fs/dev/sdev_ncache.c (revision 2621:4ea88858d952)
1*2621Sllai1 /*
2*2621Sllai1  * CDDL HEADER START
3*2621Sllai1  *
4*2621Sllai1  * The contents of this file are subject to the terms of the
5*2621Sllai1  * Common Development and Distribution License (the "License").
6*2621Sllai1  * You may not use this file except in compliance with the License.
7*2621Sllai1  *
8*2621Sllai1  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*2621Sllai1  * or http://www.opensolaris.org/os/licensing.
10*2621Sllai1  * See the License for the specific language governing permissions
11*2621Sllai1  * and limitations under the License.
12*2621Sllai1  *
13*2621Sllai1  * When distributing Covered Code, include this CDDL HEADER in each
14*2621Sllai1  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*2621Sllai1  * If applicable, add the following below this CDDL HEADER, with the
16*2621Sllai1  * fields enclosed by brackets "[]" replaced with your own identifying
17*2621Sllai1  * information: Portions Copyright [yyyy] [name of copyright owner]
18*2621Sllai1  *
19*2621Sllai1  * CDDL HEADER END
20*2621Sllai1  */
21*2621Sllai1 /*
22*2621Sllai1  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23*2621Sllai1  * Use is subject to license terms.
24*2621Sllai1  */
25*2621Sllai1 
26*2621Sllai1 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27*2621Sllai1 
28*2621Sllai1 /*
29*2621Sllai1  * negative cache handling for the /dev fs
30*2621Sllai1  */
31*2621Sllai1 
32*2621Sllai1 #include <sys/types.h>
33*2621Sllai1 #include <sys/param.h>
34*2621Sllai1 #include <sys/t_lock.h>
35*2621Sllai1 #include <sys/systm.h>
36*2621Sllai1 #include <sys/sysmacros.h>
37*2621Sllai1 #include <sys/user.h>
38*2621Sllai1 #include <sys/time.h>
39*2621Sllai1 #include <sys/vfs.h>
40*2621Sllai1 #include <sys/vnode.h>
41*2621Sllai1 #include <sys/file.h>
42*2621Sllai1 #include <sys/fcntl.h>
43*2621Sllai1 #include <sys/flock.h>
44*2621Sllai1 #include <sys/kmem.h>
45*2621Sllai1 #include <sys/uio.h>
46*2621Sllai1 #include <sys/errno.h>
47*2621Sllai1 #include <sys/stat.h>
48*2621Sllai1 #include <sys/cred.h>
49*2621Sllai1 #include <sys/cmn_err.h>
50*2621Sllai1 #include <sys/debug.h>
51*2621Sllai1 #include <sys/mode.h>
52*2621Sllai1 #include <sys/policy.h>
53*2621Sllai1 #include <fs/fs_subr.h>
54*2621Sllai1 #include <sys/mount.h>
55*2621Sllai1 #include <sys/fs/snode.h>
56*2621Sllai1 #include <sys/fs/dv_node.h>
57*2621Sllai1 #include <sys/fs/sdev_node.h>
58*2621Sllai1 #include <sys/sunndi.h>
59*2621Sllai1 #include <sys/sunmdi.h>
60*2621Sllai1 #include <sys/ddi.h>
61*2621Sllai1 #include <sys/modctl.h>
62*2621Sllai1 #include <sys/devctl_impl.h>
63*2621Sllai1 
64*2621Sllai1 
65*2621Sllai1 /*
66*2621Sllai1  * ncache is a negative cache of failed lookups.  An entry
67*2621Sllai1  * is added after an attempt to configure a device by that
68*2621Sllai1  * name failed.  An accumulation of these entries over time
 * gives us a set of device names for which implicit reconfiguration
70*2621Sllai1  * does not need to be attempted.  If a name is created matching
71*2621Sllai1  * an entry in ncache, that entry is removed, with the
72*2621Sllai1  * persistent store updated.
73*2621Sllai1  *
74*2621Sllai1  * Implicit reconfig is initiated for any name during lookup that
75*2621Sllai1  * can't be resolved from the backing store and that isn't
76*2621Sllai1  * present in the negative cache.  This functionality is
77*2621Sllai1  * enabled during system startup once communication with devfsadm
78*2621Sllai1  * can be achieved.  Since readdir is more general, implicit
79*2621Sllai1  * reconfig initiated by reading a directory isn't enabled until
80*2621Sllai1  * the system is more fully booted, at the time of the multi-user
81*2621Sllai1  * milestone, corresponding to init state 2.
82*2621Sllai1  *
83*2621Sllai1  * A maximum is imposed on the number of entries in the cache
84*2621Sllai1  * to limit some script going wild and as a defense against attack.
85*2621Sllai1  * The default limit is 64 and can be adjusted via sdev_nc_max_entries.
86*2621Sllai1  *
 * Each entry also has an expiration count.  When a name in the cache
 * is looked up, its count is reset to the default.  Subsequent boots
 * will decrement the count if a name isn't referenced.  This permits
 * a once-only entry to eventually be removed over time.
91*2621Sllai1  *
92*2621Sllai1  * sdev_reconfig_delay implements a "debounce" of the timing beyond
93*2621Sllai1  * system available indication, providing what the filesystem considers
94*2621Sllai1  * to be the system-is-fully-booted state.  This is provided to adjust
95*2621Sllai1  * the timing if some application startup is performing a readdir
96*2621Sllai1  * in /dev that initiates a troublesome implicit reconfig on every boot.
97*2621Sllai1  *
98*2621Sllai1  * sdev_nc_disable_reset can be used to disable clearing the negative cache
99*2621Sllai1  * on reconfig boot.  The default is to clear the cache on reconfig boot.
100*2621Sllai1  * sdev_nc_disable can be used to disable the negative cache itself.
101*2621Sllai1  *
102*2621Sllai1  * sdev_reconfig_disable can be used to disable implicit reconfig.
103*2621Sllai1  * The default is that implicit reconfig is enabled.
104*2621Sllai1  */
105*2621Sllai1 
/*
 * Tunables and their compiled-in defaults.
 */
#define	SDEV_NC_EXPIRECNT	4
#define	SDEV_NC_MAX_ENTRIES	64
#define	SDEV_RECONFIG_DELAY	6	/* seconds */

/* expiration count given to an entry when it is added or referenced */
int			sdev_nc_expirecnt = SDEV_NC_EXPIRECNT;
/* upper bound on the number of entries held in the negative cache */
int			sdev_nc_max_entries = SDEV_NC_MAX_ENTRIES;
/* debounce, in seconds, between "system available" and "boot complete" */
int			sdev_reconfig_delay = SDEV_RECONFIG_DELAY;
/* not referenced in this file */
int			sdev_reconfig_verbose = 0;
/* non-zero disables implicit reconfig; not referenced in this file */
int			sdev_reconfig_disable = 0;
/* non-zero disables the negative cache altogether */
int			sdev_nc_disable = 0;
/* non-zero keeps stored entries instead of clearing them on reconfig boot */
int			sdev_nc_disable_reset = 0;
/* non-zero reports cache activity through cmn_err() */
int			sdev_nc_verbose = 0;
119*2621Sllai1 
120*2621Sllai1 /* globals */
121*2621Sllai1 sdev_nc_list_t		*sdev_ncache;
122*2621Sllai1 int			sdev_boot_state = SDEV_BOOT_STATE_INITIAL;
123*2621Sllai1 int			sdev_reconfig_boot = 0;
124*2621Sllai1 static timeout_id_t	sdev_timeout_id = 0;
125*2621Sllai1 
126*2621Sllai1 /* static prototypes */
127*2621Sllai1 static void sdev_ncache_write_complete(nvfd_t *);
128*2621Sllai1 static void sdev_ncache_write(void);
129*2621Sllai1 static void sdev_ncache_process_store(void);
130*2621Sllai1 static sdev_nc_list_t *sdev_nc_newlist(void);
131*2621Sllai1 static void sdev_nc_free_unlinked_node(sdev_nc_node_t *);
132*2621Sllai1 static void sdev_nc_free_all_nodes(sdev_nc_list_t *);
133*2621Sllai1 static void sdev_nc_freelist(sdev_nc_list_t *);
134*2621Sllai1 static sdev_nc_node_t *sdev_nc_findpath(sdev_nc_list_t *, char *);
135*2621Sllai1 static void sdev_nc_insertnode(sdev_nc_list_t *, sdev_nc_node_t *);
136*2621Sllai1 static void sdev_nc_free_bootonly(void);
137*2621Sllai1 
138*2621Sllai1 
139*2621Sllai1 /*
140*2621Sllai1  * called once at filesystem initialization
141*2621Sllai1  */
142*2621Sllai1 void
143*2621Sllai1 sdev_ncache_init(void)
144*2621Sllai1 {
145*2621Sllai1 	sdev_ncache = sdev_nc_newlist();
146*2621Sllai1 }
147*2621Sllai1 
148*2621Sllai1 /*
149*2621Sllai1  * called at mount of the global instance
150*2621Sllai1  * currently the global instance is never unmounted
151*2621Sllai1  */
152*2621Sllai1 void
153*2621Sllai1 sdev_ncache_setup(void)
154*2621Sllai1 {
155*2621Sllai1 	nvfd_t	*nvf = sdevfd;
156*2621Sllai1 
157*2621Sllai1 	nvf_register_write_complete(nvf, sdev_ncache_write_complete);
158*2621Sllai1 
159*2621Sllai1 	i_ddi_read_devname_file();
160*2621Sllai1 	sdev_ncache_process_store();
161*2621Sllai1 	sdev_devstate_change();
162*2621Sllai1 }
163*2621Sllai1 
164*2621Sllai1 static void
165*2621Sllai1 sdev_nvp_cache_free(nvfd_t *nvf)
166*2621Sllai1 {
167*2621Sllai1 	nvp_devname_t	*np;
168*2621Sllai1 	nvp_devname_t	*next;
169*2621Sllai1 
170*2621Sllai1 	for (np = NVF_DEVNAME_LIST(nvf); np; np = next) {
171*2621Sllai1 		next = NVP_DEVNAME_NEXT(np);
172*2621Sllai1 		nfd_nvp_free_and_unlink(nvf, NVPLIST(np));
173*2621Sllai1 	}
174*2621Sllai1 }
175*2621Sllai1 
176*2621Sllai1 static void
177*2621Sllai1 sdev_ncache_process_store(void)
178*2621Sllai1 {
179*2621Sllai1 	nvfd_t		*nvf = sdevfd;
180*2621Sllai1 	sdev_nc_list_t	*ncl = sdev_ncache;
181*2621Sllai1 	nvp_devname_t	*np;
182*2621Sllai1 	sdev_nc_node_t	*lp;
183*2621Sllai1 	char		*path;
184*2621Sllai1 	int		i, n;
185*2621Sllai1 
186*2621Sllai1 	if (sdev_nc_disable)
187*2621Sllai1 		return;
188*2621Sllai1 
189*2621Sllai1 	for (np = NVF_DEVNAME_LIST(nvf); np; np = NVP_DEVNAME_NEXT(np)) {
190*2621Sllai1 		for (i = 0; i < np->nvp_npaths; i++) {
191*2621Sllai1 			sdcmn_err5(("    %s %d\n",
192*2621Sllai1 			    np->nvp_paths[i], np->nvp_expirecnts[i]));
193*2621Sllai1 			if (ncl->ncl_nentries < sdev_nc_max_entries) {
194*2621Sllai1 				path = np->nvp_paths[i];
195*2621Sllai1 				n = strlen(path) + 1;
196*2621Sllai1 				lp = kmem_alloc(sizeof (sdev_nc_node_t),
197*2621Sllai1 				    KM_SLEEP);
198*2621Sllai1 				lp->ncn_name = kmem_alloc(n, KM_SLEEP);
199*2621Sllai1 				bcopy(path, lp->ncn_name, n);
200*2621Sllai1 				lp->ncn_flags = NCN_SRC_STORE;
201*2621Sllai1 				lp->ncn_expirecnt = np->nvp_expirecnts[i];
202*2621Sllai1 				sdev_nc_insertnode(ncl, lp);
203*2621Sllai1 			} else if (sdev_nc_verbose) {
204*2621Sllai1 				cmn_err(CE_CONT,
205*2621Sllai1 				    "?%s: truncating from ncache (max %d)\n",
206*2621Sllai1 				    np->nvp_paths[i], sdev_nc_max_entries);
207*2621Sllai1 			}
208*2621Sllai1 		}
209*2621Sllai1 	}
210*2621Sllai1 }
211*2621Sllai1 
212*2621Sllai1 static void
213*2621Sllai1 sdev_ncache_write_complete(nvfd_t *nvf)
214*2621Sllai1 {
215*2621Sllai1 	sdev_nc_list_t	*ncl = sdev_ncache;
216*2621Sllai1 
217*2621Sllai1 	mutex_enter(&ncl->ncl_mutex);
218*2621Sllai1 
219*2621Sllai1 	ASSERT(ncl->ncl_flags & NCL_LIST_WRITING);
220*2621Sllai1 
221*2621Sllai1 	if (ncl->ncl_flags & NCL_LIST_DIRTY) {
222*2621Sllai1 		sdcmn_err5(("ncache write complete but dirty again\n"));
223*2621Sllai1 		ncl->ncl_flags &= ~NCL_LIST_DIRTY;
224*2621Sllai1 		mutex_exit(&ncl->ncl_mutex);
225*2621Sllai1 		sdev_ncache_write();
226*2621Sllai1 	} else {
227*2621Sllai1 		sdcmn_err5(("ncache write complete\n"));
228*2621Sllai1 		ncl->ncl_flags &= ~NCL_LIST_WRITING;
229*2621Sllai1 		mutex_exit(&ncl->ncl_mutex);
230*2621Sllai1 		rw_enter(&nvf->nvf_lock, RW_WRITER);
231*2621Sllai1 		sdev_nvp_cache_free(nvf);
232*2621Sllai1 		rw_exit(&nvf->nvf_lock);
233*2621Sllai1 	}
234*2621Sllai1 }
235*2621Sllai1 
236*2621Sllai1 static void
237*2621Sllai1 sdev_ncache_write(void)
238*2621Sllai1 {
239*2621Sllai1 	nvfd_t		*nvf = sdevfd;
240*2621Sllai1 	sdev_nc_list_t	*ncl = sdev_ncache;
241*2621Sllai1 	nvp_devname_t	*np;
242*2621Sllai1 	sdev_nc_node_t	*lp;
243*2621Sllai1 	int		n, i;
244*2621Sllai1 
245*2621Sllai1 	if (sdev_cache_write_disable) {
246*2621Sllai1 		mutex_enter(&ncl->ncl_mutex);
247*2621Sllai1 		ncl->ncl_flags &= ~NCL_LIST_WRITING;
248*2621Sllai1 		mutex_exit(&ncl->ncl_mutex);
249*2621Sllai1 		return;
250*2621Sllai1 	}
251*2621Sllai1 
252*2621Sllai1 	/* proper lock ordering here is essential */
253*2621Sllai1 	rw_enter(&nvf->nvf_lock, RW_WRITER);
254*2621Sllai1 	sdev_nvp_cache_free(nvf);
255*2621Sllai1 
256*2621Sllai1 	rw_enter(&ncl->ncl_lock, RW_READER);
257*2621Sllai1 	n = ncl->ncl_nentries;
258*2621Sllai1 	ASSERT(n <= sdev_nc_max_entries);
259*2621Sllai1 
260*2621Sllai1 	np = kmem_zalloc(sizeof (nvp_devname_t), KM_SLEEP);
261*2621Sllai1 	np->nvp_npaths = n;
262*2621Sllai1 	np->nvp_paths = kmem_zalloc(n * sizeof (char *), KM_SLEEP);
263*2621Sllai1 	np->nvp_expirecnts = kmem_zalloc(n * sizeof (int), KM_SLEEP);
264*2621Sllai1 
265*2621Sllai1 	i = 0;
266*2621Sllai1 	for (lp = list_head(&ncl->ncl_list); lp;
267*2621Sllai1 	    lp = list_next(&ncl->ncl_list, lp)) {
268*2621Sllai1 		np->nvp_paths[i] = i_ddi_strdup(lp->ncn_name, KM_SLEEP);
269*2621Sllai1 		np->nvp_expirecnts[i] = lp->ncn_expirecnt;
270*2621Sllai1 		sdcmn_err5(("    %s %d\n",
271*2621Sllai1 		    np->nvp_paths[i], np->nvp_expirecnts[i]));
272*2621Sllai1 		i++;
273*2621Sllai1 	}
274*2621Sllai1 
275*2621Sllai1 	rw_exit(&ncl->ncl_lock);
276*2621Sllai1 
277*2621Sllai1 	NVF_MARK_DIRTY(nvf);
278*2621Sllai1 	nfd_nvp_link(nvf, NVPLIST(np));
279*2621Sllai1 	rw_exit(&nvf->nvf_lock);
280*2621Sllai1 
281*2621Sllai1 	wake_nvpflush_daemon();
282*2621Sllai1 }
283*2621Sllai1 
284*2621Sllai1 static void
285*2621Sllai1 sdev_nc_flush_updates(void)
286*2621Sllai1 {
287*2621Sllai1 	sdev_nc_list_t *ncl = sdev_ncache;
288*2621Sllai1 
289*2621Sllai1 	if (sdev_nc_disable || sdev_cache_write_disable)
290*2621Sllai1 		return;
291*2621Sllai1 
292*2621Sllai1 	mutex_enter(&ncl->ncl_mutex);
293*2621Sllai1 	if (((ncl->ncl_flags &
294*2621Sllai1 	    (NCL_LIST_DIRTY | NCL_LIST_WENABLE | NCL_LIST_WRITING)) ==
295*2621Sllai1 	    (NCL_LIST_DIRTY | NCL_LIST_WENABLE))) {
296*2621Sllai1 		ncl->ncl_flags &= ~NCL_LIST_DIRTY;
297*2621Sllai1 		ncl->ncl_flags |= NCL_LIST_WRITING;
298*2621Sllai1 		mutex_exit(&ncl->ncl_mutex);
299*2621Sllai1 		sdev_ncache_write();
300*2621Sllai1 	} else {
301*2621Sllai1 		mutex_exit(&ncl->ncl_mutex);
302*2621Sllai1 	}
303*2621Sllai1 }
304*2621Sllai1 
305*2621Sllai1 static void
306*2621Sllai1 sdev_nc_flush_boot_update(void)
307*2621Sllai1 {
308*2621Sllai1 	sdev_nc_list_t *ncl = sdev_ncache;
309*2621Sllai1 
310*2621Sllai1 	if (sdev_nc_disable || sdev_cache_write_disable ||
311*2621Sllai1 	    (sdev_boot_state == SDEV_BOOT_STATE_INITIAL)) {
312*2621Sllai1 		return;
313*2621Sllai1 	}
314*2621Sllai1 	mutex_enter(&ncl->ncl_mutex);
315*2621Sllai1 	if (ncl->ncl_flags & NCL_LIST_WENABLE) {
316*2621Sllai1 		mutex_exit(&ncl->ncl_mutex);
317*2621Sllai1 		sdev_nc_flush_updates();
318*2621Sllai1 	} else {
319*2621Sllai1 		mutex_exit(&ncl->ncl_mutex);
320*2621Sllai1 	}
321*2621Sllai1 
322*2621Sllai1 }
323*2621Sllai1 
324*2621Sllai1 static void
325*2621Sllai1 sdev_state_boot_complete()
326*2621Sllai1 {
327*2621Sllai1 	sdev_nc_list_t	*ncl = sdev_ncache;
328*2621Sllai1 	sdev_nc_node_t	*lp, *next;
329*2621Sllai1 
330*2621Sllai1 	/*
331*2621Sllai1 	 * Once boot is complete, decrement the expire count of each entry
332*2621Sllai1 	 * in the cache not touched by a reference.  Remove any that
333*2621Sllai1 	 * goes to zero.  This effectively removes random entries over
334*2621Sllai1 	 * time.
335*2621Sllai1 	 */
336*2621Sllai1 	rw_enter(&ncl->ncl_lock, RW_WRITER);
337*2621Sllai1 	mutex_enter(&ncl->ncl_mutex);
338*2621Sllai1 
339*2621Sllai1 	for (lp = list_head(&ncl->ncl_list); lp; lp = next) {
340*2621Sllai1 		next = list_next(&ncl->ncl_list, lp);
341*2621Sllai1 		if (sdev_nc_expirecnt > 0 && lp->ncn_expirecnt > 0) {
342*2621Sllai1 			if (lp->ncn_flags & NCN_ACTIVE) {
343*2621Sllai1 				if (lp->ncn_expirecnt != sdev_nc_expirecnt) {
344*2621Sllai1 					lp->ncn_expirecnt = sdev_nc_expirecnt;
345*2621Sllai1 					ncl->ncl_flags |= NCL_LIST_DIRTY;
346*2621Sllai1 				}
347*2621Sllai1 			} else {
348*2621Sllai1 				if (--lp->ncn_expirecnt == 0) {
349*2621Sllai1 					list_remove(&ncl->ncl_list, lp);
350*2621Sllai1 					sdev_nc_free_unlinked_node(lp);
351*2621Sllai1 					ncl->ncl_nentries--;
352*2621Sllai1 				}
353*2621Sllai1 				ncl->ncl_flags |= NCL_LIST_DIRTY;
354*2621Sllai1 			}
355*2621Sllai1 		}
356*2621Sllai1 	}
357*2621Sllai1 
358*2621Sllai1 	mutex_exit(&ncl->ncl_mutex);
359*2621Sllai1 	rw_exit(&ncl->ncl_lock);
360*2621Sllai1 
361*2621Sllai1 	sdev_nc_flush_boot_update();
362*2621Sllai1 	sdev_boot_state = SDEV_BOOT_STATE_COMPLETE;
363*2621Sllai1 }
364*2621Sllai1 
365*2621Sllai1 /*
366*2621Sllai1  * Upon transition to the login state on a reconfigure boot,
367*2621Sllai1  * a debounce timer is set up so that we cache all the nonsense
368*2621Sllai1  * lookups we're hit with by the windowing system startup.
369*2621Sllai1  */
370*2621Sllai1 
371*2621Sllai1 /*ARGSUSED*/
372*2621Sllai1 static void
373*2621Sllai1 sdev_state_timeout(void *arg)
374*2621Sllai1 {
375*2621Sllai1 	sdev_timeout_id = 0;
376*2621Sllai1 	sdev_state_boot_complete();
377*2621Sllai1 }
378*2621Sllai1 
379*2621Sllai1 static void
380*2621Sllai1 sdev_state_sysavail()
381*2621Sllai1 {
382*2621Sllai1 	sdev_nc_list_t *ncl = sdev_ncache;
383*2621Sllai1 	clock_t	nticks;
384*2621Sllai1 	int nsecs;
385*2621Sllai1 
386*2621Sllai1 	mutex_enter(&ncl->ncl_mutex);
387*2621Sllai1 	ncl->ncl_flags |= NCL_LIST_WENABLE;
388*2621Sllai1 	mutex_exit(&ncl->ncl_mutex);
389*2621Sllai1 
390*2621Sllai1 	nsecs = sdev_reconfig_delay;
391*2621Sllai1 	if (nsecs == 0) {
392*2621Sllai1 		sdev_state_boot_complete();
393*2621Sllai1 	} else {
394*2621Sllai1 		nticks = drv_usectohz(1000000 * nsecs);
395*2621Sllai1 		sdcmn_err5(("timeout initiated %ld\n", nticks));
396*2621Sllai1 		sdev_timeout_id = timeout(sdev_state_timeout, NULL, nticks);
397*2621Sllai1 		sdev_nc_flush_boot_update();
398*2621Sllai1 	}
399*2621Sllai1 }
400*2621Sllai1 
401*2621Sllai1 /*
402*2621Sllai1  * Called to inform the filesystem of progress during boot,
403*2621Sllai1  * either a notice of reconfiguration boot or an indication of
404*2621Sllai1  * system boot complete.  At system boot complete, set up a
405*2621Sllai1  * timer at the expiration of which no further failed lookups
406*2621Sllai1  * will be added to the negative cache.
407*2621Sllai1  *
408*2621Sllai1  * The dev filesystem infers from reconfig boot that implicit
409*2621Sllai1  * reconfig need not be invoked at all as all available devices
410*2621Sllai1  * will have already been named.
411*2621Sllai1  *
412*2621Sllai1  * The dev filesystem infers from "system available" that devfsadmd
413*2621Sllai1  * can now be run and hence implicit reconfiguration may be initiated.
414*2621Sllai1  * During early stages of system startup, implicit reconfig is
415*2621Sllai1  * not done to avoid impacting boot performance.
416*2621Sllai1  */
417*2621Sllai1 void
418*2621Sllai1 sdev_devstate_change(void)
419*2621Sllai1 {
420*2621Sllai1 	int new_state;
421*2621Sllai1 
422*2621Sllai1 	/*
423*2621Sllai1 	 * Track system state and manage interesting transitions
424*2621Sllai1 	 */
425*2621Sllai1 	new_state = SDEV_BOOT_STATE_INITIAL;
426*2621Sllai1 	if (i_ddi_reconfig())
427*2621Sllai1 		new_state = SDEV_BOOT_STATE_RECONFIG;
428*2621Sllai1 	if (i_ddi_sysavail())
429*2621Sllai1 		new_state = SDEV_BOOT_STATE_SYSAVAIL;
430*2621Sllai1 
431*2621Sllai1 	if (sdev_boot_state < new_state) {
432*2621Sllai1 		switch (new_state) {
433*2621Sllai1 		case SDEV_BOOT_STATE_RECONFIG:
434*2621Sllai1 			sdcmn_err5(("state change: reconfigure boot\n"));
435*2621Sllai1 			sdev_boot_state = new_state;
436*2621Sllai1 			sdev_reconfig_boot = 1;
437*2621Sllai1 			if (!sdev_nc_disable_reset)
438*2621Sllai1 				sdev_nc_free_bootonly();
439*2621Sllai1 			break;
440*2621Sllai1 		case SDEV_BOOT_STATE_SYSAVAIL:
441*2621Sllai1 			sdcmn_err5(("system available\n"));
442*2621Sllai1 			sdev_boot_state = new_state;
443*2621Sllai1 			sdev_state_sysavail();
444*2621Sllai1 			break;
445*2621Sllai1 		}
446*2621Sllai1 	}
447*2621Sllai1 }
448*2621Sllai1 
449*2621Sllai1 /*
450*2621Sllai1  * Lookup: filter out entries in the negative cache
451*2621Sllai1  * Return 1 if the lookup should not cause a reconfig.
452*2621Sllai1  */
453*2621Sllai1 int
454*2621Sllai1 sdev_lookup_filter(sdev_node_t *dv, char *nm)
455*2621Sllai1 {
456*2621Sllai1 	int n;
457*2621Sllai1 	sdev_nc_list_t *ncl = sdev_ncache;
458*2621Sllai1 	sdev_nc_node_t *lp;
459*2621Sllai1 	char *path;
460*2621Sllai1 	int rval = 0;
461*2621Sllai1 	int changed = 0;
462*2621Sllai1 
463*2621Sllai1 	ASSERT(i_ddi_io_initialized());
464*2621Sllai1 	ASSERT(SDEVTOV(dv)->v_type == VDIR);
465*2621Sllai1 
466*2621Sllai1 	if (sdev_nc_disable)
467*2621Sllai1 		return (0);
468*2621Sllai1 
469*2621Sllai1 	n = strlen(dv->sdev_path) + strlen(nm) + 2;
470*2621Sllai1 	path = kmem_alloc(n, KM_SLEEP);
471*2621Sllai1 	(void) sprintf(path, "%s/%s", dv->sdev_path, nm);
472*2621Sllai1 
473*2621Sllai1 	rw_enter(&ncl->ncl_lock, RW_READER);
474*2621Sllai1 	if ((lp = sdev_nc_findpath(ncl, path)) != NULL) {
475*2621Sllai1 		sdcmn_err5(("%s/%s: lookup by %s cached, no reconfig\n",
476*2621Sllai1 		    dv->sdev_name, nm, curproc->p_user.u_comm));
477*2621Sllai1 		if (sdev_nc_verbose) {
478*2621Sllai1 			cmn_err(CE_CONT,
479*2621Sllai1 			    "?%s/%s: lookup by %s cached, no reconfig\n",
480*2621Sllai1 			    dv->sdev_name, nm, curproc->p_user.u_comm);
481*2621Sllai1 		}
482*2621Sllai1 		mutex_enter(&ncl->ncl_mutex);
483*2621Sllai1 		lp->ncn_flags |= NCN_ACTIVE;
484*2621Sllai1 		if (sdev_nc_expirecnt > 0 && lp->ncn_expirecnt > 0 &&
485*2621Sllai1 		    lp->ncn_expirecnt < sdev_nc_expirecnt) {
486*2621Sllai1 			lp->ncn_expirecnt = sdev_nc_expirecnt;
487*2621Sllai1 			ncl->ncl_flags |= NCL_LIST_DIRTY;
488*2621Sllai1 			changed = 1;
489*2621Sllai1 		}
490*2621Sllai1 		mutex_exit(&ncl->ncl_mutex);
491*2621Sllai1 		rval = 1;
492*2621Sllai1 	}
493*2621Sllai1 	rw_exit(&ncl->ncl_lock);
494*2621Sllai1 	kmem_free(path, n);
495*2621Sllai1 	if (changed)
496*2621Sllai1 		sdev_nc_flush_boot_update();
497*2621Sllai1 	return (rval);
498*2621Sllai1 }
499*2621Sllai1 
500*2621Sllai1 void
501*2621Sllai1 sdev_lookup_failed(sdev_node_t *dv, char *nm, int failed_flags)
502*2621Sllai1 {
503*2621Sllai1 	if (sdev_nc_disable)
504*2621Sllai1 		return;
505*2621Sllai1 
506*2621Sllai1 	/*
507*2621Sllai1 	 * If we're still in the initial boot stage, always update
508*2621Sllai1 	 * the cache - we may not have received notice of the
509*2621Sllai1 	 * reconfig boot state yet.  On a reconfigure boot, entries
510*2621Sllai1 	 * from the backing store are not re-persisted on update,
511*2621Sllai1 	 * but new entries are marked as needing an update.
512*2621Sllai1 	 * Never cache dynamic or non-global nodes.
513*2621Sllai1 	 */
514*2621Sllai1 	if (SDEV_IS_GLOBAL(dv) && !SDEV_IS_DYNAMIC(dv) &&
515*2621Sllai1 	    !SDEV_IS_NO_NCACHE(dv) &&
516*2621Sllai1 	    ((failed_flags & SLF_NO_NCACHE) == 0) &&
517*2621Sllai1 	    ((sdev_reconfig_boot &&
518*2621Sllai1 		(sdev_boot_state != SDEV_BOOT_STATE_COMPLETE)) ||
519*2621Sllai1 	    (!sdev_reconfig_boot && ((failed_flags & SLF_REBUILT))))) {
520*2621Sllai1 			sdev_nc_addname(sdev_ncache,
521*2621Sllai1 			    dv, nm, NCN_SRC_CURRENT|NCN_ACTIVE);
522*2621Sllai1 	}
523*2621Sllai1 }
524*2621Sllai1 
525*2621Sllai1 static sdev_nc_list_t *
526*2621Sllai1 sdev_nc_newlist(void)
527*2621Sllai1 {
528*2621Sllai1 	sdev_nc_list_t	*ncl;
529*2621Sllai1 
530*2621Sllai1 	ncl = kmem_zalloc(sizeof (sdev_nc_list_t), KM_SLEEP);
531*2621Sllai1 
532*2621Sllai1 	rw_init(&ncl->ncl_lock, NULL, RW_DEFAULT, NULL);
533*2621Sllai1 	mutex_init(&ncl->ncl_mutex, NULL, MUTEX_DEFAULT, NULL);
534*2621Sllai1 	list_create(&ncl->ncl_list, sizeof (sdev_nc_node_t),
535*2621Sllai1 	    offsetof(sdev_nc_node_t, ncn_link));
536*2621Sllai1 
537*2621Sllai1 	return (ncl);
538*2621Sllai1 }
539*2621Sllai1 
540*2621Sllai1 static void
541*2621Sllai1 sdev_nc_free_unlinked_node(sdev_nc_node_t *lp)
542*2621Sllai1 {
543*2621Sllai1 	kmem_free(lp->ncn_name, strlen(lp->ncn_name) + 1);
544*2621Sllai1 	kmem_free(lp, sizeof (sdev_nc_node_t));
545*2621Sllai1 }
546*2621Sllai1 
547*2621Sllai1 static void
548*2621Sllai1 sdev_nc_free_all_nodes(sdev_nc_list_t *ncl)
549*2621Sllai1 {
550*2621Sllai1 	sdev_nc_node_t *lp;
551*2621Sllai1 
552*2621Sllai1 	while ((lp = list_head(&ncl->ncl_list)) != NULL) {
553*2621Sllai1 		list_remove(&ncl->ncl_list, lp);
554*2621Sllai1 		sdev_nc_free_unlinked_node(lp);
555*2621Sllai1 		ncl->ncl_nentries--;
556*2621Sllai1 	}
557*2621Sllai1 	ASSERT(ncl->ncl_nentries == 0);
558*2621Sllai1 }
559*2621Sllai1 
560*2621Sllai1 static void
561*2621Sllai1 sdev_nc_freelist(sdev_nc_list_t *ncl)
562*2621Sllai1 {
563*2621Sllai1 	if (!list_is_empty(&ncl->ncl_list))
564*2621Sllai1 		sdev_nc_free_all_nodes(ncl);
565*2621Sllai1 	ASSERT(list_is_empty(&ncl->ncl_list));
566*2621Sllai1 	ASSERT(ncl->ncl_nentries == 0);
567*2621Sllai1 
568*2621Sllai1 	mutex_destroy(&ncl->ncl_mutex);
569*2621Sllai1 	rw_destroy(&ncl->ncl_lock);
570*2621Sllai1 	list_destroy(&ncl->ncl_list);
571*2621Sllai1 	kmem_free(ncl, sizeof (sdev_nc_list_t));
572*2621Sllai1 }
573*2621Sllai1 
574*2621Sllai1 static sdev_nc_node_t *
575*2621Sllai1 sdev_nc_findpath(sdev_nc_list_t *ncl, char *path)
576*2621Sllai1 {
577*2621Sllai1 	sdev_nc_node_t *lp;
578*2621Sllai1 
579*2621Sllai1 	ASSERT(RW_LOCK_HELD(&ncl->ncl_lock));
580*2621Sllai1 
581*2621Sllai1 	for (lp = list_head(&ncl->ncl_list); lp;
582*2621Sllai1 	    lp = list_next(&ncl->ncl_list, lp)) {
583*2621Sllai1 		if (strcmp(path, lp->ncn_name) == 0)
584*2621Sllai1 			return (lp);
585*2621Sllai1 	}
586*2621Sllai1 
587*2621Sllai1 	return (NULL);
588*2621Sllai1 }
589*2621Sllai1 
590*2621Sllai1 static void
591*2621Sllai1 sdev_nc_insertnode(sdev_nc_list_t *ncl, sdev_nc_node_t *new)
592*2621Sllai1 {
593*2621Sllai1 	sdev_nc_node_t *lp;
594*2621Sllai1 
595*2621Sllai1 	rw_enter(&ncl->ncl_lock, RW_WRITER);
596*2621Sllai1 
597*2621Sllai1 	lp = sdev_nc_findpath(ncl, new->ncn_name);
598*2621Sllai1 	if (lp == NULL) {
599*2621Sllai1 		if (ncl->ncl_nentries == sdev_nc_max_entries) {
600*2621Sllai1 			sdcmn_err5((
601*2621Sllai1 			    "%s by %s: not adding to ncache (max %d)\n",
602*2621Sllai1 			    new->ncn_name, curproc->p_user.u_comm,
603*2621Sllai1 			    ncl->ncl_nentries));
604*2621Sllai1 			if (sdev_nc_verbose) {
605*2621Sllai1 				cmn_err(CE_CONT, "?%s by %s: "
606*2621Sllai1 				    "not adding to ncache (max %d)\n",
607*2621Sllai1 				    new->ncn_name, curproc->p_user.u_comm,
608*2621Sllai1 				    ncl->ncl_nentries);
609*2621Sllai1 			}
610*2621Sllai1 			rw_exit(&ncl->ncl_lock);
611*2621Sllai1 			sdev_nc_free_unlinked_node(new);
612*2621Sllai1 		} else {
613*2621Sllai1 
614*2621Sllai1 			list_insert_tail(&ncl->ncl_list, new);
615*2621Sllai1 			ncl->ncl_nentries++;
616*2621Sllai1 
617*2621Sllai1 			/* don't mark list dirty for nodes from store */
618*2621Sllai1 			mutex_enter(&ncl->ncl_mutex);
619*2621Sllai1 			if ((new->ncn_flags & NCN_SRC_STORE) == 0) {
620*2621Sllai1 				sdcmn_err5(("%s by %s: add to ncache\n",
621*2621Sllai1 				    new->ncn_name, curproc->p_user.u_comm));
622*2621Sllai1 				if (sdev_nc_verbose) {
623*2621Sllai1 					cmn_err(CE_CONT,
624*2621Sllai1 					    "?%s by %s: add to ncache\n",
625*2621Sllai1 					    new->ncn_name,
626*2621Sllai1 					    curproc->p_user.u_comm);
627*2621Sllai1 				}
628*2621Sllai1 				ncl->ncl_flags |= NCL_LIST_DIRTY;
629*2621Sllai1 			}
630*2621Sllai1 			mutex_exit(&ncl->ncl_mutex);
631*2621Sllai1 			rw_exit(&ncl->ncl_lock);
632*2621Sllai1 			lp = new;
633*2621Sllai1 			sdev_nc_flush_boot_update();
634*2621Sllai1 		}
635*2621Sllai1 	} else {
636*2621Sllai1 		mutex_enter(&ncl->ncl_mutex);
637*2621Sllai1 		lp->ncn_flags |= new->ncn_flags;
638*2621Sllai1 		mutex_exit(&ncl->ncl_mutex);
639*2621Sllai1 		rw_exit(&ncl->ncl_lock);
640*2621Sllai1 		sdev_nc_free_unlinked_node(new);
641*2621Sllai1 	}
642*2621Sllai1 }
643*2621Sllai1 
644*2621Sllai1 void
645*2621Sllai1 sdev_nc_addname(sdev_nc_list_t *ncl, sdev_node_t *dv, char *nm, int flags)
646*2621Sllai1 {
647*2621Sllai1 	int n;
648*2621Sllai1 	sdev_nc_node_t *lp;
649*2621Sllai1 
650*2621Sllai1 	ASSERT(SDEVTOV(dv)->v_type == VDIR);
651*2621Sllai1 
652*2621Sllai1 	lp = kmem_zalloc(sizeof (sdev_nc_node_t), KM_SLEEP);
653*2621Sllai1 
654*2621Sllai1 	n = strlen(dv->sdev_path) + strlen(nm) + 2;
655*2621Sllai1 	lp->ncn_name = kmem_alloc(n, KM_SLEEP);
656*2621Sllai1 	(void) sprintf(lp->ncn_name, "%s/%s",
657*2621Sllai1 		dv->sdev_path, nm);
658*2621Sllai1 	lp->ncn_flags = flags;
659*2621Sllai1 	lp->ncn_expirecnt = sdev_nc_expirecnt;
660*2621Sllai1 	sdev_nc_insertnode(ncl, lp);
661*2621Sllai1 }
662*2621Sllai1 
663*2621Sllai1 void
664*2621Sllai1 sdev_nc_node_exists(sdev_node_t *dv)
665*2621Sllai1 {
666*2621Sllai1 	/* dynamic and non-global nodes are never cached */
667*2621Sllai1 	if (SDEV_IS_GLOBAL(dv) && !SDEV_IS_DYNAMIC(dv) &&
668*2621Sllai1 	    !SDEV_IS_NO_NCACHE(dv)) {
669*2621Sllai1 		sdev_nc_path_exists(sdev_ncache, dv->sdev_path);
670*2621Sllai1 	}
671*2621Sllai1 }
672*2621Sllai1 
673*2621Sllai1 void
674*2621Sllai1 sdev_nc_path_exists(sdev_nc_list_t *ncl, char *path)
675*2621Sllai1 {
676*2621Sllai1 	sdev_nc_node_t *lp;
677*2621Sllai1 
678*2621Sllai1 	if (sdev_nc_disable)
679*2621Sllai1 		return;
680*2621Sllai1 
681*2621Sllai1 	rw_enter(&ncl->ncl_lock, RW_READER);
682*2621Sllai1 	if ((lp = sdev_nc_findpath(ncl, path)) == NULL) {
683*2621Sllai1 		rw_exit(&ncl->ncl_lock);
684*2621Sllai1 		return;
685*2621Sllai1 	}
686*2621Sllai1 	if (rw_tryupgrade(&ncl->ncl_lock) == 0) {
687*2621Sllai1 		rw_exit(&ncl->ncl_lock);
688*2621Sllai1 		rw_enter(&ncl->ncl_lock, RW_WRITER);
689*2621Sllai1 		lp = sdev_nc_findpath(ncl, path);
690*2621Sllai1 	}
691*2621Sllai1 	if (lp) {
692*2621Sllai1 		list_remove(&ncl->ncl_list, lp);
693*2621Sllai1 		ncl->ncl_nentries--;
694*2621Sllai1 		mutex_enter(&ncl->ncl_mutex);
695*2621Sllai1 		ncl->ncl_flags |= NCL_LIST_DIRTY;
696*2621Sllai1 		if (ncl->ncl_flags & NCL_LIST_WENABLE) {
697*2621Sllai1 			mutex_exit(&ncl->ncl_mutex);
698*2621Sllai1 			rw_exit(&ncl->ncl_lock);
699*2621Sllai1 			sdev_nc_flush_updates();
700*2621Sllai1 		} else {
701*2621Sllai1 			mutex_exit(&ncl->ncl_mutex);
702*2621Sllai1 			rw_exit(&ncl->ncl_lock);
703*2621Sllai1 		}
704*2621Sllai1 		sdev_nc_free_unlinked_node(lp);
705*2621Sllai1 		sdcmn_err5(("%s by %s: removed from ncache\n",
706*2621Sllai1 		    path, curproc->p_user.u_comm));
707*2621Sllai1 		if (sdev_nc_verbose) {
708*2621Sllai1 			cmn_err(CE_CONT, "?%s by %s: removed from ncache\n",
709*2621Sllai1 			    path, curproc->p_user.u_comm);
710*2621Sllai1 		}
711*2621Sllai1 	} else
712*2621Sllai1 		rw_exit(&ncl->ncl_lock);
713*2621Sllai1 }
714*2621Sllai1 
715*2621Sllai1 static void
716*2621Sllai1 sdev_nc_free_bootonly(void)
717*2621Sllai1 {
718*2621Sllai1 	sdev_nc_list_t	*ncl = sdev_ncache;
719*2621Sllai1 	sdev_nc_node_t *lp;
720*2621Sllai1 	sdev_nc_node_t *next;
721*2621Sllai1 
722*2621Sllai1 	ASSERT(sdev_reconfig_boot);
723*2621Sllai1 
724*2621Sllai1 	rw_enter(&ncl->ncl_lock, RW_WRITER);
725*2621Sllai1 
726*2621Sllai1 	for (lp = list_head(&ncl->ncl_list); lp; lp = next) {
727*2621Sllai1 		next = list_next(&ncl->ncl_list, lp);
728*2621Sllai1 		if ((lp->ncn_flags & NCN_SRC_CURRENT) == 0) {
729*2621Sllai1 			sdcmn_err5(("freeing %s\n", lp->ncn_name));
730*2621Sllai1 			mutex_enter(&ncl->ncl_mutex);
731*2621Sllai1 			ncl->ncl_flags |= NCL_LIST_DIRTY;
732*2621Sllai1 			mutex_exit(&ncl->ncl_mutex);
733*2621Sllai1 			list_remove(&ncl->ncl_list, lp);
734*2621Sllai1 			sdev_nc_free_unlinked_node(lp);
735*2621Sllai1 			ncl->ncl_nentries--;
736*2621Sllai1 		}
737*2621Sllai1 	}
738*2621Sllai1 
739*2621Sllai1 	rw_exit(&ncl->ncl_lock);
740*2621Sllai1 }
741