xref: /onnv-gate/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c (revision 12066:b5d1c83f5cfc)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */
242082Seschrock 
/*
 * The ZFS retire agent is responsible for managing hot spares across all pools.
 * When we see a device fault or a device removal, we try to open the associated
 * pool and look for any hot spares.  We iterate over any available hot spares
 * and attempt a 'zpool replace' for each one.
 *
 * For vdevs diagnosed as faulty, the agent is also responsible for proactively
 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
 */
342082Seschrock 
352082Seschrock #include <fm/fmd_api.h>
362082Seschrock #include <sys/fs/zfs.h>
372082Seschrock #include <sys/fm/protocol.h>
382082Seschrock #include <sys/fm/fs/zfs.h>
392082Seschrock #include <libzfs.h>
4010817SEric.Schrock@Sun.COM #include <fm/libtopo.h>
414451Seschrock #include <string.h>
422082Seschrock 
/*
 * One entry in the singly-linked list of (pool, vdev) GUID pairs recorded by
 * zfs_vdev_repair().  The list head lives in zfs_retire_data_t.
 */
typedef struct zfs_retire_repaired {
	struct zfs_retire_repaired	*zrr_next;	/* next entry or NULL */
	uint64_t			zrr_pool;	/* pool GUID */
	uint64_t			zrr_vdev;	/* vdev GUID */
} zfs_retire_repaired_t;
4810817SEric.Schrock@Sun.COM 
4910817SEric.Schrock@Sun.COM typedef struct zfs_retire_data {
5010817SEric.Schrock@Sun.COM 	libzfs_handle_t			*zrd_hdl;
5110817SEric.Schrock@Sun.COM 	zfs_retire_repaired_t		*zrd_repaired;
5210817SEric.Schrock@Sun.COM } zfs_retire_data_t;
5310817SEric.Schrock@Sun.COM 
5410817SEric.Schrock@Sun.COM static void
zfs_retire_clear_data(fmd_hdl_t * hdl,zfs_retire_data_t * zdp)5510817SEric.Schrock@Sun.COM zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp)
5610817SEric.Schrock@Sun.COM {
5710817SEric.Schrock@Sun.COM 	zfs_retire_repaired_t *zrp;
5810817SEric.Schrock@Sun.COM 
5910817SEric.Schrock@Sun.COM 	while ((zrp = zdp->zrd_repaired) != NULL) {
6010817SEric.Schrock@Sun.COM 		zdp->zrd_repaired = zrp->zrr_next;
6110817SEric.Schrock@Sun.COM 		fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t));
6210817SEric.Schrock@Sun.COM 	}
6310817SEric.Schrock@Sun.COM }
6410817SEric.Schrock@Sun.COM 
652082Seschrock /*
662082Seschrock  * Find a pool with a matching GUID.
672082Seschrock  */
682082Seschrock typedef struct find_cbdata {
692082Seschrock 	uint64_t	cb_guid;
7010817SEric.Schrock@Sun.COM 	const char	*cb_fru;
712082Seschrock 	zpool_handle_t	*cb_zhp;
7210817SEric.Schrock@Sun.COM 	nvlist_t	*cb_vdev;
732082Seschrock } find_cbdata_t;
742082Seschrock 
752082Seschrock static int
find_pool(zpool_handle_t * zhp,void * data)762082Seschrock find_pool(zpool_handle_t *zhp, void *data)
772082Seschrock {
782082Seschrock 	find_cbdata_t *cbp = data;
792082Seschrock 
805094Slling 	if (cbp->cb_guid ==
815094Slling 	    zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) {
822082Seschrock 		cbp->cb_zhp = zhp;
832082Seschrock 		return (1);
842082Seschrock 	}
852082Seschrock 
862082Seschrock 	zpool_close(zhp);
872082Seschrock 	return (0);
882082Seschrock }
892082Seschrock 
902082Seschrock /*
912082Seschrock  * Find a vdev within a tree with a matching GUID.
922082Seschrock  */
932082Seschrock static nvlist_t *
find_vdev(libzfs_handle_t * zhdl,nvlist_t * nv,const char * search_fru,uint64_t search_guid)9410817SEric.Schrock@Sun.COM find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, const char *search_fru,
9510817SEric.Schrock@Sun.COM     uint64_t search_guid)
962082Seschrock {
972082Seschrock 	uint64_t guid;
982082Seschrock 	nvlist_t **child;
992082Seschrock 	uint_t c, children;
1002082Seschrock 	nvlist_t *ret;
10110817SEric.Schrock@Sun.COM 	char *fru;
1022082Seschrock 
10310817SEric.Schrock@Sun.COM 	if (search_fru != NULL) {
10410817SEric.Schrock@Sun.COM 		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &fru) == 0 &&
10510817SEric.Schrock@Sun.COM 		    libzfs_fru_compare(zhdl, fru, search_fru))
10610817SEric.Schrock@Sun.COM 			return (nv);
10710817SEric.Schrock@Sun.COM 	} else {
10810817SEric.Schrock@Sun.COM 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
10910817SEric.Schrock@Sun.COM 		    guid == search_guid)
11010817SEric.Schrock@Sun.COM 			return (nv);
11110817SEric.Schrock@Sun.COM 	}
1122082Seschrock 
1132082Seschrock 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1142082Seschrock 	    &child, &children) != 0)
1152082Seschrock 		return (NULL);
1162082Seschrock 
1172082Seschrock 	for (c = 0; c < children; c++) {
11810817SEric.Schrock@Sun.COM 		if ((ret = find_vdev(zhdl, child[c], search_fru,
11910817SEric.Schrock@Sun.COM 		    search_guid)) != NULL)
1202082Seschrock 			return (ret);
1212082Seschrock 	}
1222082Seschrock 
1236643Seschrock 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1246643Seschrock 	    &child, &children) != 0)
1256643Seschrock 		return (NULL);
1266643Seschrock 
1276643Seschrock 	for (c = 0; c < children; c++) {
12810817SEric.Schrock@Sun.COM 		if ((ret = find_vdev(zhdl, child[c], search_fru,
12910817SEric.Schrock@Sun.COM 		    search_guid)) != NULL)
1306643Seschrock 			return (ret);
1316643Seschrock 	}
1326643Seschrock 
1332082Seschrock 	return (NULL);
1342082Seschrock }
1352082Seschrock 
1364451Seschrock /*
1374451Seschrock  * Given a (pool, vdev) GUID pair, find the matching pool and vdev.
1384451Seschrock  */
1394451Seschrock static zpool_handle_t *
find_by_guid(libzfs_handle_t * zhdl,uint64_t pool_guid,uint64_t vdev_guid,nvlist_t ** vdevp)1404451Seschrock find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid,
1414451Seschrock     nvlist_t **vdevp)
1424451Seschrock {
1434451Seschrock 	find_cbdata_t cb;
1444451Seschrock 	zpool_handle_t *zhp;
1454451Seschrock 	nvlist_t *config, *nvroot;
1464451Seschrock 
1474451Seschrock 	/*
1484451Seschrock 	 * Find the corresponding pool and make sure the vdev still exists.
1494451Seschrock 	 */
1504451Seschrock 	cb.cb_guid = pool_guid;
1514451Seschrock 	if (zpool_iter(zhdl, find_pool, &cb) != 1)
1524451Seschrock 		return (NULL);
1534451Seschrock 
1544451Seschrock 	zhp = cb.cb_zhp;
1554451Seschrock 	config = zpool_get_config(zhp, NULL);
1564451Seschrock 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1574451Seschrock 	    &nvroot) != 0) {
1584451Seschrock 		zpool_close(zhp);
1594451Seschrock 		return (NULL);
1604451Seschrock 	}
1614451Seschrock 
16210817SEric.Schrock@Sun.COM 	if (vdev_guid != 0) {
16310817SEric.Schrock@Sun.COM 		if ((*vdevp = find_vdev(zhdl, nvroot, NULL,
16410817SEric.Schrock@Sun.COM 		    vdev_guid)) == NULL) {
16510817SEric.Schrock@Sun.COM 			zpool_close(zhp);
16610817SEric.Schrock@Sun.COM 			return (NULL);
16710817SEric.Schrock@Sun.COM 		}
1684451Seschrock 	}
1694451Seschrock 
1704451Seschrock 	return (zhp);
1714451Seschrock }
1724451Seschrock 
17310817SEric.Schrock@Sun.COM static int
search_pool(zpool_handle_t * zhp,void * data)17410817SEric.Schrock@Sun.COM search_pool(zpool_handle_t *zhp, void *data)
17510817SEric.Schrock@Sun.COM {
17610817SEric.Schrock@Sun.COM 	find_cbdata_t *cbp = data;
17710817SEric.Schrock@Sun.COM 	nvlist_t *config;
17810817SEric.Schrock@Sun.COM 	nvlist_t *nvroot;
17910817SEric.Schrock@Sun.COM 
18010817SEric.Schrock@Sun.COM 	config = zpool_get_config(zhp, NULL);
18110817SEric.Schrock@Sun.COM 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
18210817SEric.Schrock@Sun.COM 	    &nvroot) != 0) {
18310817SEric.Schrock@Sun.COM 		zpool_close(zhp);
18410817SEric.Schrock@Sun.COM 		return (0);
18510817SEric.Schrock@Sun.COM 	}
18610817SEric.Schrock@Sun.COM 
18710817SEric.Schrock@Sun.COM 	if ((cbp->cb_vdev = find_vdev(zpool_get_handle(zhp), nvroot,
18810817SEric.Schrock@Sun.COM 	    cbp->cb_fru, 0)) != NULL) {
18910817SEric.Schrock@Sun.COM 		cbp->cb_zhp = zhp;
19010817SEric.Schrock@Sun.COM 		return (1);
19110817SEric.Schrock@Sun.COM 	}
19210817SEric.Schrock@Sun.COM 
19310817SEric.Schrock@Sun.COM 	zpool_close(zhp);
19410817SEric.Schrock@Sun.COM 	return (0);
19510817SEric.Schrock@Sun.COM }
19610817SEric.Schrock@Sun.COM 
19710817SEric.Schrock@Sun.COM /*
19810817SEric.Schrock@Sun.COM  * Given a FRU FMRI, find the matching pool and vdev.
19910817SEric.Schrock@Sun.COM  */
20010817SEric.Schrock@Sun.COM static zpool_handle_t *
find_by_fru(libzfs_handle_t * zhdl,const char * fru,nvlist_t ** vdevp)20110817SEric.Schrock@Sun.COM find_by_fru(libzfs_handle_t *zhdl, const char *fru, nvlist_t **vdevp)
20210817SEric.Schrock@Sun.COM {
20310817SEric.Schrock@Sun.COM 	find_cbdata_t cb;
20410817SEric.Schrock@Sun.COM 
20510817SEric.Schrock@Sun.COM 	cb.cb_fru = fru;
20610817SEric.Schrock@Sun.COM 	cb.cb_zhp = NULL;
20710817SEric.Schrock@Sun.COM 	if (zpool_iter(zhdl, search_pool, &cb) != 1)
20810817SEric.Schrock@Sun.COM 		return (NULL);
20910817SEric.Schrock@Sun.COM 
21010817SEric.Schrock@Sun.COM 	*vdevp = cb.cb_vdev;
21110817SEric.Schrock@Sun.COM 	return (cb.cb_zhp);
21210817SEric.Schrock@Sun.COM }
21310817SEric.Schrock@Sun.COM 
2144451Seschrock /*
2154451Seschrock  * Given a vdev, attempt to replace it with every known spare until one
2164451Seschrock  * succeeds.
2174451Seschrock  */
2184451Seschrock static void
replace_with_spare(fmd_hdl_t * hdl,zpool_handle_t * zhp,nvlist_t * vdev)21910817SEric.Schrock@Sun.COM replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
2204451Seschrock {
2214451Seschrock 	nvlist_t *config, *nvroot, *replacement;
2224451Seschrock 	nvlist_t **spares;
2234451Seschrock 	uint_t s, nspares;
2244451Seschrock 	char *dev_name;
2254451Seschrock 
2264451Seschrock 	config = zpool_get_config(zhp, NULL);
2274451Seschrock 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
2284451Seschrock 	    &nvroot) != 0)
2294451Seschrock 		return;
2304451Seschrock 
2314451Seschrock 	/*
2324451Seschrock 	 * Find out if there are any hot spares available in the pool.
2334451Seschrock 	 */
2344451Seschrock 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2354451Seschrock 	    &spares, &nspares) != 0)
2364451Seschrock 		return;
2374451Seschrock 
23810817SEric.Schrock@Sun.COM 	replacement = fmd_nvl_alloc(hdl, FMD_SLEEP);
2394451Seschrock 
24010817SEric.Schrock@Sun.COM 	(void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE,
24110817SEric.Schrock@Sun.COM 	    VDEV_TYPE_ROOT);
2424451Seschrock 
24310594SGeorge.Wilson@Sun.COM 	dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
2444451Seschrock 
2454451Seschrock 	/*
2464451Seschrock 	 * Try to replace each spare, ending when we successfully
2474451Seschrock 	 * replace it.
2484451Seschrock 	 */
2494451Seschrock 	for (s = 0; s < nspares; s++) {
2504451Seschrock 		char *spare_name;
2514451Seschrock 
2524451Seschrock 		if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
2534451Seschrock 		    &spare_name) != 0)
2544451Seschrock 			continue;
2554451Seschrock 
25610817SEric.Schrock@Sun.COM 		(void) nvlist_add_nvlist_array(replacement,
25710817SEric.Schrock@Sun.COM 		    ZPOOL_CONFIG_CHILDREN, &spares[s], 1);
2584451Seschrock 
2594451Seschrock 		if (zpool_vdev_attach(zhp, dev_name, spare_name,
2604451Seschrock 		    replacement, B_TRUE) == 0)
2614451Seschrock 			break;
2624451Seschrock 	}
2634451Seschrock 
2644451Seschrock 	free(dev_name);
2654451Seschrock 	nvlist_free(replacement);
2664451Seschrock }
2674451Seschrock 
26810817SEric.Schrock@Sun.COM /*
26910817SEric.Schrock@Sun.COM  * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and
27010817SEric.Schrock@Sun.COM  * ASRU is now usable.  ZFS has found the device to be present and
27110817SEric.Schrock@Sun.COM  * functioning.
27210817SEric.Schrock@Sun.COM  */
27310817SEric.Schrock@Sun.COM /*ARGSUSED*/
27410817SEric.Schrock@Sun.COM void
zfs_vdev_repair(fmd_hdl_t * hdl,nvlist_t * nvl)27510817SEric.Schrock@Sun.COM zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
27610817SEric.Schrock@Sun.COM {
27710817SEric.Schrock@Sun.COM 	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
27810817SEric.Schrock@Sun.COM 	zfs_retire_repaired_t *zrp;
27910817SEric.Schrock@Sun.COM 	uint64_t pool_guid, vdev_guid;
28010817SEric.Schrock@Sun.COM 	nvlist_t *asru;
28110817SEric.Schrock@Sun.COM 
28210817SEric.Schrock@Sun.COM 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
28310817SEric.Schrock@Sun.COM 	    &pool_guid) != 0 || nvlist_lookup_uint64(nvl,
28410817SEric.Schrock@Sun.COM 	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
28510817SEric.Schrock@Sun.COM 		return;
28610817SEric.Schrock@Sun.COM 
28710817SEric.Schrock@Sun.COM 	/*
28810817SEric.Schrock@Sun.COM 	 * Before checking the state of the ASRU, go through and see if we've
28910817SEric.Schrock@Sun.COM 	 * already made an attempt to repair this ASRU.  This list is cleared
29010817SEric.Schrock@Sun.COM 	 * whenever we receive any kind of list event, and is designed to
29110817SEric.Schrock@Sun.COM 	 * prevent us from generating a feedback loop when we attempt repairs
29210817SEric.Schrock@Sun.COM 	 * against a faulted pool.  The problem is that checking the unusable
29310817SEric.Schrock@Sun.COM 	 * state of the ASRU can involve opening the pool, which can post
29410817SEric.Schrock@Sun.COM 	 * statechange events but otherwise leave the pool in the faulted
29510817SEric.Schrock@Sun.COM 	 * state.  This list allows us to detect when a statechange event is
29610817SEric.Schrock@Sun.COM 	 * due to our own request.
29710817SEric.Schrock@Sun.COM 	 */
29810817SEric.Schrock@Sun.COM 	for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) {
29910817SEric.Schrock@Sun.COM 		if (zrp->zrr_pool == pool_guid &&
30010817SEric.Schrock@Sun.COM 		    zrp->zrr_vdev == vdev_guid)
30110817SEric.Schrock@Sun.COM 			return;
30210817SEric.Schrock@Sun.COM 	}
30310817SEric.Schrock@Sun.COM 
30410817SEric.Schrock@Sun.COM 	asru = fmd_nvl_alloc(hdl, FMD_SLEEP);
30510817SEric.Schrock@Sun.COM 
30610817SEric.Schrock@Sun.COM 	(void) nvlist_add_uint8(asru, FM_VERSION, ZFS_SCHEME_VERSION0);
30710817SEric.Schrock@Sun.COM 	(void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
30810817SEric.Schrock@Sun.COM 	(void) nvlist_add_uint64(asru, FM_FMRI_ZFS_POOL, pool_guid);
30910817SEric.Schrock@Sun.COM 	(void) nvlist_add_uint64(asru, FM_FMRI_ZFS_VDEV, vdev_guid);
31010817SEric.Schrock@Sun.COM 
31110817SEric.Schrock@Sun.COM 	/*
31210817SEric.Schrock@Sun.COM 	 * We explicitly check for the unusable state here to make sure we
31310817SEric.Schrock@Sun.COM 	 * aren't responding to a transient state change.  As part of opening a
31410817SEric.Schrock@Sun.COM 	 * vdev, it's possible to see the 'statechange' event, only to be
31510817SEric.Schrock@Sun.COM 	 * followed by a vdev failure later.  If we don't check the current
31610817SEric.Schrock@Sun.COM 	 * state of the vdev (or pool) before marking it repaired, then we risk
31710817SEric.Schrock@Sun.COM 	 * generating spurious repair events followed immediately by the same
31810817SEric.Schrock@Sun.COM 	 * diagnosis.
31910817SEric.Schrock@Sun.COM 	 *
32010817SEric.Schrock@Sun.COM 	 * This assumes that the ZFS scheme code associated unusable (i.e.
32110817SEric.Schrock@Sun.COM 	 * isolated) with its own definition of faulty state.  In the case of a
32210817SEric.Schrock@Sun.COM 	 * DEGRADED leaf vdev (due to checksum errors), this is not the case.
32310817SEric.Schrock@Sun.COM 	 * This works, however, because the transient state change is not
32410817SEric.Schrock@Sun.COM 	 * posted in this case.  This could be made more explicit by not
32510817SEric.Schrock@Sun.COM 	 * relying on the scheme's unusable callback and instead directly
32610817SEric.Schrock@Sun.COM 	 * checking the vdev state, where we could correctly account for
32710817SEric.Schrock@Sun.COM 	 * DEGRADED state.
32810817SEric.Schrock@Sun.COM 	 */
32910817SEric.Schrock@Sun.COM 	if (!fmd_nvl_fmri_unusable(hdl, asru) && fmd_nvl_fmri_has_fault(hdl,
33010817SEric.Schrock@Sun.COM 	    asru, FMD_HAS_FAULT_ASRU, NULL)) {
33110817SEric.Schrock@Sun.COM 		topo_hdl_t *thp;
33210817SEric.Schrock@Sun.COM 		char *fmri = NULL;
33310817SEric.Schrock@Sun.COM 		int err;
33410817SEric.Schrock@Sun.COM 
33510817SEric.Schrock@Sun.COM 		thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
33610817SEric.Schrock@Sun.COM 		if (topo_fmri_nvl2str(thp, asru, &fmri, &err) == 0)
33710866SEric.Schrock@Sun.COM 			(void) fmd_repair_asru(hdl, fmri);
33810817SEric.Schrock@Sun.COM 		fmd_hdl_topo_rele(hdl, thp);
33910817SEric.Schrock@Sun.COM 
34010817SEric.Schrock@Sun.COM 		topo_hdl_strfree(thp, fmri);
34110817SEric.Schrock@Sun.COM 	}
342*12066SRobert.Johnston@Sun.COM 	nvlist_free(asru);
34310817SEric.Schrock@Sun.COM 	zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP);
34410817SEric.Schrock@Sun.COM 	zrp->zrr_next = zdp->zrd_repaired;
34510817SEric.Schrock@Sun.COM 	zrp->zrr_pool = pool_guid;
34610817SEric.Schrock@Sun.COM 	zrp->zrr_vdev = vdev_guid;
34710817SEric.Schrock@Sun.COM 	zdp->zrd_repaired = zrp;
34810817SEric.Schrock@Sun.COM }
34910817SEric.Schrock@Sun.COM 
3502082Seschrock /*ARGSUSED*/
3512082Seschrock static void
zfs_retire_recv(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class)3522082Seschrock zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
3532082Seschrock     const char *class)
3542082Seschrock {
3552082Seschrock 	uint64_t pool_guid, vdev_guid;
3562082Seschrock 	zpool_handle_t *zhp;
35710817SEric.Schrock@Sun.COM 	nvlist_t *resource, *fault, *fru;
3584451Seschrock 	nvlist_t **faults;
3594451Seschrock 	uint_t f, nfaults;
36010817SEric.Schrock@Sun.COM 	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
36110817SEric.Schrock@Sun.COM 	libzfs_handle_t *zhdl = zdp->zrd_hdl;
3624451Seschrock 	boolean_t fault_device, degrade_device;
3634451Seschrock 	boolean_t is_repair;
36410817SEric.Schrock@Sun.COM 	char *scheme, *fmri;
3652082Seschrock 	nvlist_t *vdev;
3667275Sstephh 	char *uuid;
3677275Sstephh 	int repair_done = 0;
3689120SStephen.Hanson@Sun.COM 	boolean_t retire;
36910817SEric.Schrock@Sun.COM 	boolean_t is_disk;
37010817SEric.Schrock@Sun.COM 	vdev_aux_t aux;
37110817SEric.Schrock@Sun.COM 	topo_hdl_t *thp;
37210817SEric.Schrock@Sun.COM 	int err;
3732082Seschrock 
3742082Seschrock 	/*
3754451Seschrock 	 * If this is a resource notifying us of device removal, then simply
3764451Seschrock 	 * check for an available spare and continue.
3774451Seschrock 	 */
3784451Seschrock 	if (strcmp(class, "resource.fs.zfs.removed") == 0) {
3794451Seschrock 		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
3804451Seschrock 		    &pool_guid) != 0 ||
3814451Seschrock 		    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
3824451Seschrock 		    &vdev_guid) != 0)
3834451Seschrock 			return;
3844451Seschrock 
3854451Seschrock 		if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
3864451Seschrock 		    &vdev)) == NULL)
3874451Seschrock 			return;
3884451Seschrock 
3894451Seschrock 		if (fmd_prop_get_int32(hdl, "spare_on_remove"))
39010817SEric.Schrock@Sun.COM 			replace_with_spare(hdl, zhp, vdev);
3914451Seschrock 		zpool_close(zhp);
3924451Seschrock 		return;
3934451Seschrock 	}
3944451Seschrock 
3959120SStephen.Hanson@Sun.COM 	if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
3969120SStephen.Hanson@Sun.COM 		return;
3979120SStephen.Hanson@Sun.COM 
39810817SEric.Schrock@Sun.COM 	if (strcmp(class, "resource.fs.zfs.statechange") == 0 ||
39910817SEric.Schrock@Sun.COM 	    strcmp(class,
40010817SEric.Schrock@Sun.COM 	    "resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove") == 0) {
40110817SEric.Schrock@Sun.COM 		zfs_vdev_repair(hdl, nvl);
40210817SEric.Schrock@Sun.COM 		return;
40310817SEric.Schrock@Sun.COM 	}
40410817SEric.Schrock@Sun.COM 
40510817SEric.Schrock@Sun.COM 	zfs_retire_clear_data(hdl, zdp);
40610817SEric.Schrock@Sun.COM 
4077275Sstephh 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
4084451Seschrock 		is_repair = B_TRUE;
4094451Seschrock 	else
4104451Seschrock 		is_repair = B_FALSE;
4114451Seschrock 
4124451Seschrock 	/*
4134451Seschrock 	 * We subscribe to zfs faults as well as all repair events.
4142082Seschrock 	 */
4152082Seschrock 	if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
4162082Seschrock 	    &faults, &nfaults) != 0)
4172082Seschrock 		return;
4182082Seschrock 
4192082Seschrock 	for (f = 0; f < nfaults; f++) {
4204451Seschrock 		fault = faults[f];
4214451Seschrock 
4224451Seschrock 		fault_device = B_FALSE;
4234451Seschrock 		degrade_device = B_FALSE;
42410817SEric.Schrock@Sun.COM 		is_disk = B_FALSE;
4254451Seschrock 
4269120SStephen.Hanson@Sun.COM 		if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE,
4279120SStephen.Hanson@Sun.COM 		    &retire) == 0 && retire == 0)
4289120SStephen.Hanson@Sun.COM 			continue;
4299120SStephen.Hanson@Sun.COM 
4304451Seschrock 		/*
4314451Seschrock 		 * While we subscribe to fault.fs.zfs.*, we only take action
4324451Seschrock 		 * for faults targeting a specific vdev (open failure or SERD
43310817SEric.Schrock@Sun.COM 		 * failure).  We also subscribe to fault.io.* events, so that
43410817SEric.Schrock@Sun.COM 		 * faulty disks will be faulted in the ZFS configuration.
4354451Seschrock 		 */
43610817SEric.Schrock@Sun.COM 		if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) {
43710817SEric.Schrock@Sun.COM 			fault_device = B_TRUE;
43810817SEric.Schrock@Sun.COM 		} else if (fmd_nvl_class_match(hdl, fault,
43910817SEric.Schrock@Sun.COM 		    "fault.fs.zfs.vdev.checksum")) {
44010817SEric.Schrock@Sun.COM 			degrade_device = B_TRUE;
44110817SEric.Schrock@Sun.COM 		} else if (fmd_nvl_class_match(hdl, fault,
44210817SEric.Schrock@Sun.COM 		    "fault.fs.zfs.device")) {
44310817SEric.Schrock@Sun.COM 			fault_device = B_FALSE;
44410817SEric.Schrock@Sun.COM 		} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
44510817SEric.Schrock@Sun.COM 			is_disk = B_TRUE;
4464451Seschrock 			fault_device = B_TRUE;
44710817SEric.Schrock@Sun.COM 		} else {
4484451Seschrock 			continue;
44910817SEric.Schrock@Sun.COM 		}
45010817SEric.Schrock@Sun.COM 
45110817SEric.Schrock@Sun.COM 		if (is_disk) {
45210817SEric.Schrock@Sun.COM 			/*
45310817SEric.Schrock@Sun.COM 			 * This is a disk fault.  Lookup the FRU, convert it to
45410817SEric.Schrock@Sun.COM 			 * an FMRI string, and attempt to find a matching vdev.
45510817SEric.Schrock@Sun.COM 			 */
45610817SEric.Schrock@Sun.COM 			if (nvlist_lookup_nvlist(fault, FM_FAULT_FRU,
45710817SEric.Schrock@Sun.COM 			    &fru) != 0 ||
45810817SEric.Schrock@Sun.COM 			    nvlist_lookup_string(fru, FM_FMRI_SCHEME,
45910817SEric.Schrock@Sun.COM 			    &scheme) != 0)
46010817SEric.Schrock@Sun.COM 				continue;
46110817SEric.Schrock@Sun.COM 
46210817SEric.Schrock@Sun.COM 			if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0)
46310817SEric.Schrock@Sun.COM 				continue;
46410817SEric.Schrock@Sun.COM 
46510817SEric.Schrock@Sun.COM 			thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
46610817SEric.Schrock@Sun.COM 			if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) {
46710817SEric.Schrock@Sun.COM 				fmd_hdl_topo_rele(hdl, thp);
46810817SEric.Schrock@Sun.COM 				continue;
46910817SEric.Schrock@Sun.COM 			}
47010817SEric.Schrock@Sun.COM 
47110817SEric.Schrock@Sun.COM 			zhp = find_by_fru(zhdl, fmri, &vdev);
47210817SEric.Schrock@Sun.COM 			topo_hdl_strfree(thp, fmri);
47310817SEric.Schrock@Sun.COM 			fmd_hdl_topo_rele(hdl, thp);
47410817SEric.Schrock@Sun.COM 
47510817SEric.Schrock@Sun.COM 			if (zhp == NULL)
47610817SEric.Schrock@Sun.COM 				continue;
4774451Seschrock 
47810817SEric.Schrock@Sun.COM 			(void) nvlist_lookup_uint64(vdev,
47910817SEric.Schrock@Sun.COM 			    ZPOOL_CONFIG_GUID, &vdev_guid);
48010817SEric.Schrock@Sun.COM 			aux = VDEV_AUX_EXTERNAL;
48110817SEric.Schrock@Sun.COM 		} else {
48210817SEric.Schrock@Sun.COM 			/*
48310817SEric.Schrock@Sun.COM 			 * This is a ZFS fault.  Lookup the resource, and
48410817SEric.Schrock@Sun.COM 			 * attempt to find the matching vdev.
48510817SEric.Schrock@Sun.COM 			 */
48610817SEric.Schrock@Sun.COM 			if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
48710817SEric.Schrock@Sun.COM 			    &resource) != 0 ||
48810817SEric.Schrock@Sun.COM 			    nvlist_lookup_string(resource, FM_FMRI_SCHEME,
48910817SEric.Schrock@Sun.COM 			    &scheme) != 0)
49010817SEric.Schrock@Sun.COM 				continue;
4914451Seschrock 
49210817SEric.Schrock@Sun.COM 			if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0)
49310817SEric.Schrock@Sun.COM 				continue;
49410817SEric.Schrock@Sun.COM 
49510817SEric.Schrock@Sun.COM 			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL,
49610817SEric.Schrock@Sun.COM 			    &pool_guid) != 0)
49710817SEric.Schrock@Sun.COM 				continue;
4984451Seschrock 
49910817SEric.Schrock@Sun.COM 			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV,
50010817SEric.Schrock@Sun.COM 			    &vdev_guid) != 0) {
50110817SEric.Schrock@Sun.COM 				if (is_repair)
50210817SEric.Schrock@Sun.COM 					vdev_guid = 0;
50310817SEric.Schrock@Sun.COM 				else
50410817SEric.Schrock@Sun.COM 					continue;
50510817SEric.Schrock@Sun.COM 			}
50610817SEric.Schrock@Sun.COM 
50710817SEric.Schrock@Sun.COM 			if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
50810817SEric.Schrock@Sun.COM 			    &vdev)) == NULL)
50910817SEric.Schrock@Sun.COM 				continue;
51010817SEric.Schrock@Sun.COM 
51110817SEric.Schrock@Sun.COM 			aux = VDEV_AUX_ERR_EXCEEDED;
51210817SEric.Schrock@Sun.COM 		}
51310817SEric.Schrock@Sun.COM 
51410817SEric.Schrock@Sun.COM 		if (vdev_guid == 0) {
51510817SEric.Schrock@Sun.COM 			/*
51610817SEric.Schrock@Sun.COM 			 * For pool-level repair events, clear the entire pool.
51710817SEric.Schrock@Sun.COM 			 */
51810921STim.Haley@Sun.COM 			(void) zpool_clear(zhp, NULL, NULL);
51910817SEric.Schrock@Sun.COM 			zpool_close(zhp);
5202082Seschrock 			continue;
52110817SEric.Schrock@Sun.COM 		}
5222082Seschrock 
5234451Seschrock 		/*
5244451Seschrock 		 * If this is a repair event, then mark the vdev as repaired and
5254451Seschrock 		 * continue.
5264451Seschrock 		 */
5274451Seschrock 		if (is_repair) {
5287275Sstephh 			repair_done = 1;
5294451Seschrock 			(void) zpool_vdev_clear(zhp, vdev_guid);
5302082Seschrock 			zpool_close(zhp);
5312082Seschrock 			continue;
5322082Seschrock 		}
5332082Seschrock 
5342082Seschrock 		/*
5354451Seschrock 		 * Actively fault the device if needed.
5362082Seschrock 		 */
5374451Seschrock 		if (fault_device)
53810817SEric.Schrock@Sun.COM 			(void) zpool_vdev_fault(zhp, vdev_guid, aux);
5394451Seschrock 		if (degrade_device)
54010817SEric.Schrock@Sun.COM 			(void) zpool_vdev_degrade(zhp, vdev_guid, aux);
5412082Seschrock 
5422082Seschrock 		/*
5434451Seschrock 		 * Attempt to substitute a hot spare.
5442082Seschrock 		 */
54510817SEric.Schrock@Sun.COM 		replace_with_spare(hdl, zhp, vdev);
5462082Seschrock 		zpool_close(zhp);
5472082Seschrock 	}
5487275Sstephh 
5497275Sstephh 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done &&
5507275Sstephh 	    nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
5517275Sstephh 		fmd_case_uuresolved(hdl, uuid);
5522082Seschrock }
5532082Seschrock 
5542082Seschrock static const fmd_hdl_ops_t fmd_ops = {
5552082Seschrock 	zfs_retire_recv,	/* fmdo_recv */
5562082Seschrock 	NULL,			/* fmdo_timeout */
5572082Seschrock 	NULL,			/* fmdo_close */
5582082Seschrock 	NULL,			/* fmdo_stats */
5592082Seschrock 	NULL,			/* fmdo_gc */
5602082Seschrock };
5612082Seschrock 
5622082Seschrock static const fmd_prop_t fmd_props[] = {
5634451Seschrock 	{ "spare_on_remove", FMD_TYPE_BOOL, "true" },
5642082Seschrock 	{ NULL, 0, NULL }
5652082Seschrock };
5662082Seschrock 
5672082Seschrock static const fmd_hdl_info_t fmd_info = {
5682082Seschrock 	"ZFS Retire Agent", "1.0", &fmd_ops, fmd_props
5692082Seschrock };
5702082Seschrock 
5712082Seschrock void
_fmd_init(fmd_hdl_t * hdl)5722082Seschrock _fmd_init(fmd_hdl_t *hdl)
5732082Seschrock {
57410817SEric.Schrock@Sun.COM 	zfs_retire_data_t *zdp;
5752082Seschrock 	libzfs_handle_t *zhdl;
5762082Seschrock 
5772082Seschrock 	if ((zhdl = libzfs_init()) == NULL)
5782082Seschrock 		return;
5792082Seschrock 
5802082Seschrock 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
5812082Seschrock 		libzfs_fini(zhdl);
5822082Seschrock 		return;
5832082Seschrock 	}
5842082Seschrock 
58510817SEric.Schrock@Sun.COM 	zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP);
58610817SEric.Schrock@Sun.COM 	zdp->zrd_hdl = zhdl;
58710817SEric.Schrock@Sun.COM 
58810817SEric.Schrock@Sun.COM 	fmd_hdl_setspecific(hdl, zdp);
5892082Seschrock }
5902082Seschrock 
5912082Seschrock void
_fmd_fini(fmd_hdl_t * hdl)5922082Seschrock _fmd_fini(fmd_hdl_t *hdl)
5932082Seschrock {
59410817SEric.Schrock@Sun.COM 	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
5952082Seschrock 
59610817SEric.Schrock@Sun.COM 	if (zdp != NULL) {
59710817SEric.Schrock@Sun.COM 		zfs_retire_clear_data(hdl, zdp);
59810817SEric.Schrock@Sun.COM 		libzfs_fini(zdp->zrd_hdl);
59910817SEric.Schrock@Sun.COM 		fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t));
60010817SEric.Schrock@Sun.COM 	}
6012082Seschrock }
602