1*789Sahrens /*
2*789Sahrens  * CDDL HEADER START
3*789Sahrens  *
4*789Sahrens  * The contents of this file are subject to the terms of the
5*789Sahrens  * Common Development and Distribution License, Version 1.0 only
6*789Sahrens  * (the "License").  You may not use this file except in compliance
7*789Sahrens  * with the License.
8*789Sahrens  *
9*789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*789Sahrens  * or http://www.opensolaris.org/os/licensing.
11*789Sahrens  * See the License for the specific language governing permissions
12*789Sahrens  * and limitations under the License.
13*789Sahrens  *
14*789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
15*789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
17*789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
18*789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
19*789Sahrens  *
20*789Sahrens  * CDDL HEADER END
21*789Sahrens  */
22*789Sahrens /*
23*789Sahrens  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*789Sahrens  * Use is subject to license terms.
25*789Sahrens  */
26*789Sahrens 
27*789Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*789Sahrens 
/*
 * This file contains the functions which analyze the status of a pool.  This
 * includes both the status of an active pool, as well as the status of
 * exported pools.  Returns one of the ZPOOL_STATUS_* defines describing the
 * status of the pool.  This status is independent (to a certain degree) from
 * the state of the pool.  A pool's state describes only whether or not it is
 * capable of providing the necessary fault tolerance for data.  The status
 * describes the overall status of devices.  A pool that is online can still
 * have a device that is experiencing errors.
 *
 * Only a subset of the possible faults can be detected using 'zpool status',
 * and not all possible errors correspond to an FMA message ID.  The
 * explanation is left up to the caller, depending on whether it is a live pool
 * or an import.
 */
44*789Sahrens 
45*789Sahrens #include <libzfs.h>
46*789Sahrens #include <string.h>
47*789Sahrens #include "libzfs_impl.h"
48*789Sahrens 
/*
 * Message ID table.  This must be kept in sync with the ZPOOL_STATUS_* defines
 * in libzfs.h: a status value is used directly as an index into this table.
 * Note that there are some status results which go past the end of this table,
 * and hence have no associated message ID.
 *
 * The array itself is const so that its entries cannot be reassigned at
 * runtime.  The element type remains 'char *' because entries are handed back
 * to callers through a 'char **' out-parameter.
 */
static char *const msgid_table[] = {
	"ZFS-8000-14",
	"ZFS-8000-2Q",
	"ZFS-8000-3C",
	"ZFS-8000-4J",
	"ZFS-8000-5E",
	"ZFS-8000-6X",
	"ZFS-8000-72",
	"ZFS-8000-8A",
	"ZFS-8000-9P",
	"ZFS-8000-A5"
};

/* Number of entries in msgid_table. */
#define	NMSGID	(sizeof (msgid_table) / sizeof (msgid_table[0]))
68*789Sahrens 
69*789Sahrens /* ARGSUSED */
70*789Sahrens static int
71*789Sahrens vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
72*789Sahrens {
73*789Sahrens 	return (state == VDEV_STATE_CANT_OPEN &&
74*789Sahrens 	    aux == VDEV_AUX_OPEN_FAILED);
75*789Sahrens }
76*789Sahrens 
/*
 * Predicate for find_vdev_problem(): returns 1 when the supplied error count
 * is non-zero and 0 otherwise.  The state and auxiliary state are ignored.
 */
/* ARGSUSED */
static int
vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
{
	if (errs == 0)
		return (0);

	return (1);
}
83*789Sahrens 
84*789Sahrens /* ARGSUSED */
85*789Sahrens static int
86*789Sahrens vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
87*789Sahrens {
88*789Sahrens 	return (state == VDEV_STATE_CANT_OPEN);
89*789Sahrens }
90*789Sahrens 
91*789Sahrens /* ARGSUSED */
92*789Sahrens static int
93*789Sahrens vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
94*789Sahrens {
95*789Sahrens 	return (state == VDEV_STATE_OFFLINE);
96*789Sahrens }
97*789Sahrens 
98*789Sahrens /*
99*789Sahrens  * Detect if any leaf devices that have seen errors or could not be opened.
100*789Sahrens  */
101*789Sahrens static int
102*789Sahrens find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
103*789Sahrens {
104*789Sahrens 	nvlist_t **child;
105*789Sahrens 	vdev_stat_t *vs;
106*789Sahrens 	uint_t c, children;
107*789Sahrens 	char *type;
108*789Sahrens 
109*789Sahrens 	/*
110*789Sahrens 	 * Ignore problems within a 'replacing' vdev, since we're presumably in
111*789Sahrens 	 * the process of repairing any such errors, and don't want to call them
112*789Sahrens 	 * out again.  We'll pick up the fact that a resilver is happening
113*789Sahrens 	 * later.
114*789Sahrens 	 */
115*789Sahrens 	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
116*789Sahrens 	if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
117*789Sahrens 		return (FALSE);
118*789Sahrens 
119*789Sahrens 	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
120*789Sahrens 	    &children) == 0) {
121*789Sahrens 		for (c = 0; c < children; c++)
122*789Sahrens 			if (find_vdev_problem(child[c], func))
123*789Sahrens 				return (TRUE);
124*789Sahrens 	} else {
125*789Sahrens 		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
126*789Sahrens 		    (uint64_t **)&vs, &c) == 0);
127*789Sahrens 
128*789Sahrens 		if (func(vs->vs_state, vs->vs_aux,
129*789Sahrens 		    vs->vs_read_errors +
130*789Sahrens 		    vs->vs_write_errors +
131*789Sahrens 		    vs->vs_checksum_errors))
132*789Sahrens 			return (TRUE);
133*789Sahrens 	}
134*789Sahrens 
135*789Sahrens 	return (FALSE);
136*789Sahrens }
137*789Sahrens 
138*789Sahrens /*
139*789Sahrens  * Active pool health status.
140*789Sahrens  *
141*789Sahrens  * To determine the status for a pool, we make several passes over the config,
142*789Sahrens  * picking the most egregious error we find.  In order of importance, we do the
143*789Sahrens  * following:
144*789Sahrens  *
145*789Sahrens  *	- Check for a complete and valid configuration
146*789Sahrens  *	- Look for any missing devices
147*789Sahrens  *	- Look for any devices showing errors
148*789Sahrens  *	- Check for any data errors
149*789Sahrens  *	- Check for any resilvering devices
150*789Sahrens  *
151*789Sahrens  * There can obviously be multiple errors within a single pool, so this routine
152*789Sahrens  * only picks the most damaging of all the current errors to report.
153*789Sahrens  */
154*789Sahrens static zpool_status_t
155*789Sahrens check_status(nvlist_t *config, int isimport)
156*789Sahrens {
157*789Sahrens 	nvlist_t *nvroot;
158*789Sahrens 	vdev_stat_t *vs;
159*789Sahrens 	uint_t vsc;
160*789Sahrens 
161*789Sahrens 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
162*789Sahrens 	    &nvroot) == 0);
163*789Sahrens 	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
164*789Sahrens 	    (uint64_t **)&vs, &vsc) == 0);
165*789Sahrens 
166*789Sahrens 	/*
167*789Sahrens 	 * Check that the config is complete.
168*789Sahrens 	 */
169*789Sahrens 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
170*789Sahrens 	    vs->vs_aux == VDEV_AUX_BAD_GUID_SUM) {
171*789Sahrens 		return (ZPOOL_STATUS_BAD_GUID_SUM);
172*789Sahrens 	}
173*789Sahrens 
174*789Sahrens 	/*
175*789Sahrens 	 * Missing devices
176*789Sahrens 	 */
177*789Sahrens 	if (find_vdev_problem(nvroot, vdev_missing)) {
178*789Sahrens 		if (vs->vs_state == VDEV_STATE_CANT_OPEN)
179*789Sahrens 			return (ZPOOL_STATUS_MISSING_DEV_NR);
180*789Sahrens 		else
181*789Sahrens 			return (ZPOOL_STATUS_MISSING_DEV_R);
182*789Sahrens 	}
183*789Sahrens 
184*789Sahrens 	/*
185*789Sahrens 	 * Devices with corrupted labels.
186*789Sahrens 	 */
187*789Sahrens 	if (find_vdev_problem(nvroot, vdev_broken)) {
188*789Sahrens 		if (vs->vs_state == VDEV_STATE_CANT_OPEN)
189*789Sahrens 			return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
190*789Sahrens 		else
191*789Sahrens 			return (ZPOOL_STATUS_CORRUPT_LABEL_R);
192*789Sahrens 	}
193*789Sahrens 
194*789Sahrens 	/*
195*789Sahrens 	 * Devices with errors
196*789Sahrens 	 */
197*789Sahrens 	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
198*789Sahrens 		return (ZPOOL_STATUS_FAILING_DEV);
199*789Sahrens 
200*789Sahrens 	/*
201*789Sahrens 	 * Offlined devices
202*789Sahrens 	 */
203*789Sahrens 	if (find_vdev_problem(nvroot, vdev_offlined))
204*789Sahrens 		return (ZPOOL_STATUS_OFFLINE_DEV);
205*789Sahrens 
206*789Sahrens 	/*
207*789Sahrens 	 * Currently resilvering
208*789Sahrens 	 */
209*789Sahrens 	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
210*789Sahrens 		return (ZPOOL_STATUS_RESILVERING);
211*789Sahrens 
212*789Sahrens 	/*
213*789Sahrens 	 * We currently have no way to detect the following errors:
214*789Sahrens 	 *
215*789Sahrens 	 * 	CORRUPT_CACHE
216*789Sahrens 	 * 	VERSION_MISMATCH
217*789Sahrens 	 * 	CORRUPT_POOL
218*789Sahrens 	 * 	CORRUPT_DATA
219*789Sahrens 	 */
220*789Sahrens 
221*789Sahrens 	return (ZPOOL_STATUS_OK);
222*789Sahrens }
223*789Sahrens 
224*789Sahrens zpool_status_t
225*789Sahrens zpool_get_status(zpool_handle_t *zhp, char **msgid)
226*789Sahrens {
227*789Sahrens 	zpool_status_t ret = check_status(zhp->zpool_config, FALSE);
228*789Sahrens 
229*789Sahrens 	if (ret >= NMSGID)
230*789Sahrens 		*msgid = NULL;
231*789Sahrens 	else
232*789Sahrens 		*msgid = msgid_table[ret];
233*789Sahrens 
234*789Sahrens 	return (ret);
235*789Sahrens }
236*789Sahrens 
237*789Sahrens zpool_status_t
238*789Sahrens zpool_import_status(nvlist_t *config, char **msgid)
239*789Sahrens {
240*789Sahrens 	zpool_status_t ret = check_status(config, TRUE);
241*789Sahrens 
242*789Sahrens 	if (ret >= NMSGID)
243*789Sahrens 		*msgid = NULL;
244*789Sahrens 	else
245*789Sahrens 		*msgid = msgid_table[ret];
246*789Sahrens 
247*789Sahrens 	return (ret);
248*789Sahrens }
249