1789Sahrens /*
2789Sahrens  * CDDL HEADER START
3789Sahrens  *
4789Sahrens  * The contents of this file are subject to the terms of the
5*1544Seschrock  * Common Development and Distribution License (the "License").
6*1544Seschrock  * You may not use this file except in compliance with the License.
7789Sahrens  *
8789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9789Sahrens  * or http://www.opensolaris.org/os/licensing.
10789Sahrens  * See the License for the specific language governing permissions
11789Sahrens  * and limitations under the License.
12789Sahrens  *
13789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18789Sahrens  *
19789Sahrens  * CDDL HEADER END
20789Sahrens  */
21789Sahrens /*
22*1544Seschrock  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23789Sahrens  * Use is subject to license terms.
24789Sahrens  */
25789Sahrens 
26789Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
27789Sahrens 
/*
 * This file contains the functions which analyze the status of a pool.  This
 * includes both the status of an active pool and the status of exported
 * pools.  Returns one of the ZPOOL_STATUS_* defines describing the status of
 * the pool.  This status is independent (to a certain degree) from the state of
 * the pool.  A pool's state describes only whether or not it is capable of
 * providing the necessary fault tolerance for data.  The status describes the
 * overall status of devices.  A pool that is online can still have a device
 * that is experiencing errors.
 *
 * Only a subset of the possible faults can be detected using 'zpool status',
 * and not all possible errors correspond to an FMA message ID.  The
 * explanation is left up to the caller, depending on whether it is a live
 * pool or an import.
 */
43789Sahrens 
44789Sahrens #include <libzfs.h>
45789Sahrens #include <string.h>
46789Sahrens #include "libzfs_impl.h"
47789Sahrens 
/*
 * Message IDs reported by zpool_import_status(), indexed by zpool_status_t
 * value.  This table must be kept in sync with the ZPOOL_STATUS_* defines in
 * libzfs.h.  Some status values lie past the end of this table and therefore
 * have no associated message ID.
 */
static char *msgid_table[] = {
	/* 0 */	"ZFS-8000-14",
	/* 1 */	"ZFS-8000-2Q",
	/* 2 */	"ZFS-8000-3C",
	/* 3 */	"ZFS-8000-4J",
	/* 4 */	"ZFS-8000-5E",
	/* 5 */	"ZFS-8000-6X",
	/* 6 */	"ZFS-8000-72",
	/* 7 */	"ZFS-8000-8A",
	/* 8 */	"ZFS-8000-9P",
	/* 9 */	"ZFS-8000-A5",
};
65789Sahrens 
/*
 * Message IDs reported by zpool_get_status() for an active pool, indexed by
 * zpool_status_t value.  For an active pool, a certain class of static errors
 * is overridden by the faults as analyzed by FMA.  Those faults have separate
 * knowledge articles, and the article named by 'zpool status' must match the
 * one indicated by the syslog error message.  The entries that differ from
 * msgid_table are marked below.
 */
static char *msgid_table_active[] = {
	/* 0 */	"ZFS-8000-14",
	/* 1 */	"ZFS-8000-D3",		/* overridden */
	/* 2 */	"ZFS-8000-D3",		/* overridden */
	/* 3 */	"ZFS-8000-4J",
	/* 4 */	"ZFS-8000-5E",
	/* 5 */	"ZFS-8000-6X",
	/* 6 */	"ZFS-8000-CS",		/* overridden */
	/* 7 */	"ZFS-8000-8A",
	/* 8 */	"ZFS-8000-9P",
	/* 9 */	"ZFS-8000-CS"		/* overridden */
};
84*1544Seschrock 
/* Number of entries in msgid_table; larger status values have no message ID. */
#define	NMSGID	(sizeof (msgid_table) / sizeof (*msgid_table))
86789Sahrens 
87789Sahrens /* ARGSUSED */
88789Sahrens static int
89789Sahrens vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
90789Sahrens {
91789Sahrens 	return (state == VDEV_STATE_CANT_OPEN &&
92789Sahrens 	    aux == VDEV_AUX_OPEN_FAILED);
93789Sahrens }
94789Sahrens 
/*
 * Predicate for find_vdev_problem(): true when the device has a non-zero
 * error count.  The state and auxiliary state are not consulted.
 */
/* ARGSUSED */
static int
vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
{
	if (errs == 0)
		return (0);

	return (1);
}
101789Sahrens 
102789Sahrens /* ARGSUSED */
103789Sahrens static int
104789Sahrens vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
105789Sahrens {
106789Sahrens 	return (state == VDEV_STATE_CANT_OPEN);
107789Sahrens }
108789Sahrens 
109789Sahrens /* ARGSUSED */
110789Sahrens static int
111789Sahrens vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
112789Sahrens {
113789Sahrens 	return (state == VDEV_STATE_OFFLINE);
114789Sahrens }
115789Sahrens 
116789Sahrens /*
117789Sahrens  * Detect if any leaf devices that have seen errors or could not be opened.
118789Sahrens  */
119789Sahrens static int
120789Sahrens find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
121789Sahrens {
122789Sahrens 	nvlist_t **child;
123789Sahrens 	vdev_stat_t *vs;
124789Sahrens 	uint_t c, children;
125789Sahrens 	char *type;
126789Sahrens 
127789Sahrens 	/*
128789Sahrens 	 * Ignore problems within a 'replacing' vdev, since we're presumably in
129789Sahrens 	 * the process of repairing any such errors, and don't want to call them
130789Sahrens 	 * out again.  We'll pick up the fact that a resilver is happening
131789Sahrens 	 * later.
132789Sahrens 	 */
133789Sahrens 	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
134789Sahrens 	if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
135789Sahrens 		return (FALSE);
136789Sahrens 
137789Sahrens 	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
138789Sahrens 	    &children) == 0) {
139789Sahrens 		for (c = 0; c < children; c++)
140789Sahrens 			if (find_vdev_problem(child[c], func))
141789Sahrens 				return (TRUE);
142789Sahrens 	} else {
143789Sahrens 		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
144789Sahrens 		    (uint64_t **)&vs, &c) == 0);
145789Sahrens 
146789Sahrens 		if (func(vs->vs_state, vs->vs_aux,
147789Sahrens 		    vs->vs_read_errors +
148789Sahrens 		    vs->vs_write_errors +
149789Sahrens 		    vs->vs_checksum_errors))
150789Sahrens 			return (TRUE);
151789Sahrens 	}
152789Sahrens 
153789Sahrens 	return (FALSE);
154789Sahrens }
155789Sahrens 
156789Sahrens /*
157789Sahrens  * Active pool health status.
158789Sahrens  *
159789Sahrens  * To determine the status for a pool, we make several passes over the config,
160789Sahrens  * picking the most egregious error we find.  In order of importance, we do the
161789Sahrens  * following:
162789Sahrens  *
163789Sahrens  *	- Check for a complete and valid configuration
164*1544Seschrock  *	- Look for any missing devices in a non-replicated config
165*1544Seschrock  *	- Check for any data errors
166*1544Seschrock  *	- Check for any missing devices in a replicated config
167789Sahrens  *	- Look for any devices showing errors
168789Sahrens  *	- Check for any resilvering devices
169789Sahrens  *
170789Sahrens  * There can obviously be multiple errors within a single pool, so this routine
171789Sahrens  * only picks the most damaging of all the current errors to report.
172789Sahrens  */
173789Sahrens static zpool_status_t
174789Sahrens check_status(nvlist_t *config, int isimport)
175789Sahrens {
176789Sahrens 	nvlist_t *nvroot;
177789Sahrens 	vdev_stat_t *vs;
178789Sahrens 	uint_t vsc;
179*1544Seschrock 	uint64_t nerr;
180789Sahrens 
181789Sahrens 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
182789Sahrens 	    &nvroot) == 0);
183789Sahrens 	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
184789Sahrens 	    (uint64_t **)&vs, &vsc) == 0);
185789Sahrens 
186789Sahrens 	/*
187789Sahrens 	 * Check that the config is complete.
188789Sahrens 	 */
189789Sahrens 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
190*1544Seschrock 	    vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
191789Sahrens 		return (ZPOOL_STATUS_BAD_GUID_SUM);
192*1544Seschrock 
193*1544Seschrock 	/*
194*1544Seschrock 	 * Missing devices in non-replicated config.
195*1544Seschrock 	 */
196*1544Seschrock 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
197*1544Seschrock 	    find_vdev_problem(nvroot, vdev_missing))
198*1544Seschrock 		return (ZPOOL_STATUS_MISSING_DEV_NR);
199*1544Seschrock 
200*1544Seschrock 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
201*1544Seschrock 	    find_vdev_problem(nvroot, vdev_broken))
202*1544Seschrock 		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
203*1544Seschrock 
204*1544Seschrock 	/*
205*1544Seschrock 	 * Corrupted pool metadata
206*1544Seschrock 	 */
207*1544Seschrock 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
208*1544Seschrock 	    vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
209*1544Seschrock 		return (ZPOOL_STATUS_CORRUPT_POOL);
210*1544Seschrock 
211*1544Seschrock 	/*
212*1544Seschrock 	 * Persistent data errors.
213*1544Seschrock 	 */
214*1544Seschrock 	if (!isimport) {
215*1544Seschrock 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
216*1544Seschrock 		    &nerr) == 0 && nerr != 0)
217*1544Seschrock 			return (ZPOOL_STATUS_CORRUPT_DATA);
218789Sahrens 	}
219789Sahrens 
220789Sahrens 	/*
221*1544Seschrock 	 * Missing devices in a replicated config.
222789Sahrens 	 */
223*1544Seschrock 	if (find_vdev_problem(nvroot, vdev_missing))
224*1544Seschrock 		return (ZPOOL_STATUS_MISSING_DEV_R);
225*1544Seschrock 	if (find_vdev_problem(nvroot, vdev_broken))
226*1544Seschrock 		return (ZPOOL_STATUS_CORRUPT_LABEL_R);
227789Sahrens 
228789Sahrens 	/*
229789Sahrens 	 * Devices with errors
230789Sahrens 	 */
231789Sahrens 	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
232789Sahrens 		return (ZPOOL_STATUS_FAILING_DEV);
233789Sahrens 
234789Sahrens 	/*
235789Sahrens 	 * Offlined devices
236789Sahrens 	 */
237789Sahrens 	if (find_vdev_problem(nvroot, vdev_offlined))
238789Sahrens 		return (ZPOOL_STATUS_OFFLINE_DEV);
239789Sahrens 
240789Sahrens 	/*
241789Sahrens 	 * Currently resilvering
242789Sahrens 	 */
243789Sahrens 	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
244789Sahrens 		return (ZPOOL_STATUS_RESILVERING);
245789Sahrens 
246789Sahrens 	/*
247789Sahrens 	 * We currently have no way to detect the following errors:
248789Sahrens 	 *
249789Sahrens 	 * 	CORRUPT_CACHE
250789Sahrens 	 * 	VERSION_MISMATCH
251789Sahrens 	 */
252789Sahrens 
253789Sahrens 	return (ZPOOL_STATUS_OK);
254789Sahrens }
255789Sahrens 
256789Sahrens zpool_status_t
257789Sahrens zpool_get_status(zpool_handle_t *zhp, char **msgid)
258789Sahrens {
259789Sahrens 	zpool_status_t ret = check_status(zhp->zpool_config, FALSE);
260789Sahrens 
261789Sahrens 	if (ret >= NMSGID)
262789Sahrens 		*msgid = NULL;
263789Sahrens 	else
264*1544Seschrock 		*msgid = msgid_table_active[ret];
265789Sahrens 
266789Sahrens 	return (ret);
267789Sahrens }
268789Sahrens 
269789Sahrens zpool_status_t
270789Sahrens zpool_import_status(nvlist_t *config, char **msgid)
271789Sahrens {
272789Sahrens 	zpool_status_t ret = check_status(config, TRUE);
273789Sahrens 
274789Sahrens 	if (ret >= NMSGID)
275789Sahrens 		*msgid = NULL;
276789Sahrens 	else
277789Sahrens 		*msgid = msgid_table[ret];
278789Sahrens 
279789Sahrens 	return (ret);
280789Sahrens }
281