1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 5*1544Seschrock * Common Development and Distribution License (the "License"). 6*1544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 22*1544Seschrock * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens /* 29789Sahrens * This file contains the functions which analyze the status of a pool. This 30789Sahrens * include both the status of an active pool, as well as the status exported 31789Sahrens * pools. Returns one of the ZPOOL_STATUS_* defines describing the status of 32789Sahrens * the pool. This status is independent (to a certain degree) from the state of 33789Sahrens * the pool. A pool's state descsribes only whether or not it is capable of 34789Sahrens * providing the necessary fault tolerance for data. The status describes the 35789Sahrens * overall status of devices. A pool that is online can still have a device 36789Sahrens * that is experiencing errors. 37789Sahrens * 38789Sahrens * Only a subset of the possible faults can be detected using 'zpool status', 39789Sahrens * and not all possible errors correspond to a FMA message ID. The explanation 40789Sahrens * is left up to the caller, depending on whether it is a live pool or an 41789Sahrens * import. 42789Sahrens */ 43789Sahrens 44789Sahrens #include <libzfs.h> 45789Sahrens #include <string.h> 46789Sahrens #include "libzfs_impl.h" 47789Sahrens 48789Sahrens /* 49789Sahrens * Message ID table. This must be kep in sync with the ZPOOL_STATUS_* defines 50789Sahrens * in libzfs.h. Note that there are some status results which go past the end 51789Sahrens * of this table, and hence have no associated message ID. 52789Sahrens */ 53789Sahrens static char *msgid_table[] = { 54789Sahrens "ZFS-8000-14", 55789Sahrens "ZFS-8000-2Q", 56789Sahrens "ZFS-8000-3C", 57789Sahrens "ZFS-8000-4J", 58789Sahrens "ZFS-8000-5E", 59789Sahrens "ZFS-8000-6X", 60789Sahrens "ZFS-8000-72", 61789Sahrens "ZFS-8000-8A", 62789Sahrens "ZFS-8000-9P", 63789Sahrens "ZFS-8000-A5" 64789Sahrens }; 65789Sahrens 66*1544Seschrock /* 67*1544Seschrock * If the pool is active, a certain class of static errors is overridden by the 68*1544Seschrock * faults as analayzed by FMA. These faults have separate knowledge articles, 69*1544Seschrock * and the article referred to by 'zpool status' must match that indicated by 70*1544Seschrock * the syslog error message. We override missing data as well as corrupt pool. 71*1544Seschrock */ 72*1544Seschrock static char *msgid_table_active[] = { 73*1544Seschrock "ZFS-8000-14", 74*1544Seschrock "ZFS-8000-D3", /* overridden */ 75*1544Seschrock "ZFS-8000-D3", /* overridden */ 76*1544Seschrock "ZFS-8000-4J", 77*1544Seschrock "ZFS-8000-5E", 78*1544Seschrock "ZFS-8000-6X", 79*1544Seschrock "ZFS-8000-CS", /* overridden */ 80*1544Seschrock "ZFS-8000-8A", 81*1544Seschrock "ZFS-8000-9P", 82*1544Seschrock "ZFS-8000-CS", /* overridden */ 83*1544Seschrock }; 84*1544Seschrock 85789Sahrens #define NMSGID (sizeof (msgid_table) / sizeof (msgid_table[0])) 86789Sahrens 87789Sahrens /* ARGSUSED */ 88789Sahrens static int 89789Sahrens vdev_missing(uint64_t state, uint64_t aux, uint64_t errs) 90789Sahrens { 91789Sahrens return (state == VDEV_STATE_CANT_OPEN && 92789Sahrens aux == VDEV_AUX_OPEN_FAILED); 93789Sahrens } 94789Sahrens 95789Sahrens /* ARGSUSED */ 96789Sahrens static int 97789Sahrens vdev_errors(uint64_t state, uint64_t aux, uint64_t errs) 98789Sahrens { 99789Sahrens return (errs != 0); 100789Sahrens } 101789Sahrens 102789Sahrens /* ARGSUSED */ 103789Sahrens static int 104789Sahrens vdev_broken(uint64_t state, uint64_t aux, uint64_t errs) 105789Sahrens { 106789Sahrens return (state == VDEV_STATE_CANT_OPEN); 107789Sahrens } 108789Sahrens 109789Sahrens /* ARGSUSED */ 110789Sahrens static int 111789Sahrens vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs) 112789Sahrens { 113789Sahrens return (state == VDEV_STATE_OFFLINE); 114789Sahrens } 115789Sahrens 116789Sahrens /* 117789Sahrens * Detect if any leaf devices that have seen errors or could not be opened. 118789Sahrens */ 119789Sahrens static int 120789Sahrens find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) 121789Sahrens { 122789Sahrens nvlist_t **child; 123789Sahrens vdev_stat_t *vs; 124789Sahrens uint_t c, children; 125789Sahrens char *type; 126789Sahrens 127789Sahrens /* 128789Sahrens * Ignore problems within a 'replacing' vdev, since we're presumably in 129789Sahrens * the process of repairing any such errors, and don't want to call them 130789Sahrens * out again. We'll pick up the fact that a resilver is happening 131789Sahrens * later. 132789Sahrens */ 133789Sahrens verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0); 134789Sahrens if (strcmp(type, VDEV_TYPE_REPLACING) == 0) 135789Sahrens return (FALSE); 136789Sahrens 137789Sahrens if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, 138789Sahrens &children) == 0) { 139789Sahrens for (c = 0; c < children; c++) 140789Sahrens if (find_vdev_problem(child[c], func)) 141789Sahrens return (TRUE); 142789Sahrens } else { 143789Sahrens verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS, 144789Sahrens (uint64_t **)&vs, &c) == 0); 145789Sahrens 146789Sahrens if (func(vs->vs_state, vs->vs_aux, 147789Sahrens vs->vs_read_errors + 148789Sahrens vs->vs_write_errors + 149789Sahrens vs->vs_checksum_errors)) 150789Sahrens return (TRUE); 151789Sahrens } 152789Sahrens 153789Sahrens return (FALSE); 154789Sahrens } 155789Sahrens 156789Sahrens /* 157789Sahrens * Active pool health status. 158789Sahrens * 159789Sahrens * To determine the status for a pool, we make several passes over the config, 160789Sahrens * picking the most egregious error we find. In order of importance, we do the 161789Sahrens * following: 162789Sahrens * 163789Sahrens * - Check for a complete and valid configuration 164*1544Seschrock * - Look for any missing devices in a non-replicated config 165*1544Seschrock * - Check for any data errors 166*1544Seschrock * - Check for any missing devices in a replicated config 167789Sahrens * - Look for any devices showing errors 168789Sahrens * - Check for any resilvering devices 169789Sahrens * 170789Sahrens * There can obviously be multiple errors within a single pool, so this routine 171789Sahrens * only picks the most damaging of all the current errors to report. 172789Sahrens */ 173789Sahrens static zpool_status_t 174789Sahrens check_status(nvlist_t *config, int isimport) 175789Sahrens { 176789Sahrens nvlist_t *nvroot; 177789Sahrens vdev_stat_t *vs; 178789Sahrens uint_t vsc; 179*1544Seschrock uint64_t nerr; 180789Sahrens 181789Sahrens verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 182789Sahrens &nvroot) == 0); 183789Sahrens verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS, 184789Sahrens (uint64_t **)&vs, &vsc) == 0); 185789Sahrens 186789Sahrens /* 187789Sahrens * Check that the config is complete. 188789Sahrens */ 189789Sahrens if (vs->vs_state == VDEV_STATE_CANT_OPEN && 190*1544Seschrock vs->vs_aux == VDEV_AUX_BAD_GUID_SUM) 191789Sahrens return (ZPOOL_STATUS_BAD_GUID_SUM); 192*1544Seschrock 193*1544Seschrock /* 194*1544Seschrock * Missing devices in non-replicated config. 195*1544Seschrock */ 196*1544Seschrock if (vs->vs_state == VDEV_STATE_CANT_OPEN && 197*1544Seschrock find_vdev_problem(nvroot, vdev_missing)) 198*1544Seschrock return (ZPOOL_STATUS_MISSING_DEV_NR); 199*1544Seschrock 200*1544Seschrock if (vs->vs_state == VDEV_STATE_CANT_OPEN && 201*1544Seschrock find_vdev_problem(nvroot, vdev_broken)) 202*1544Seschrock return (ZPOOL_STATUS_CORRUPT_LABEL_NR); 203*1544Seschrock 204*1544Seschrock /* 205*1544Seschrock * Corrupted pool metadata 206*1544Seschrock */ 207*1544Seschrock if (vs->vs_state == VDEV_STATE_CANT_OPEN && 208*1544Seschrock vs->vs_aux == VDEV_AUX_CORRUPT_DATA) 209*1544Seschrock return (ZPOOL_STATUS_CORRUPT_POOL); 210*1544Seschrock 211*1544Seschrock /* 212*1544Seschrock * Persistent data errors. 213*1544Seschrock */ 214*1544Seschrock if (!isimport) { 215*1544Seschrock if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, 216*1544Seschrock &nerr) == 0 && nerr != 0) 217*1544Seschrock return (ZPOOL_STATUS_CORRUPT_DATA); 218789Sahrens } 219789Sahrens 220789Sahrens /* 221*1544Seschrock * Missing devices in a replicated config. 222789Sahrens */ 223*1544Seschrock if (find_vdev_problem(nvroot, vdev_missing)) 224*1544Seschrock return (ZPOOL_STATUS_MISSING_DEV_R); 225*1544Seschrock if (find_vdev_problem(nvroot, vdev_broken)) 226*1544Seschrock return (ZPOOL_STATUS_CORRUPT_LABEL_R); 227789Sahrens 228789Sahrens /* 229789Sahrens * Devices with errors 230789Sahrens */ 231789Sahrens if (!isimport && find_vdev_problem(nvroot, vdev_errors)) 232789Sahrens return (ZPOOL_STATUS_FAILING_DEV); 233789Sahrens 234789Sahrens /* 235789Sahrens * Offlined devices 236789Sahrens */ 237789Sahrens if (find_vdev_problem(nvroot, vdev_offlined)) 238789Sahrens return (ZPOOL_STATUS_OFFLINE_DEV); 239789Sahrens 240789Sahrens /* 241789Sahrens * Currently resilvering 242789Sahrens */ 243789Sahrens if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER) 244789Sahrens return (ZPOOL_STATUS_RESILVERING); 245789Sahrens 246789Sahrens /* 247789Sahrens * We currently have no way to detect the following errors: 248789Sahrens * 249789Sahrens * CORRUPT_CACHE 250789Sahrens * VERSION_MISMATCH 251789Sahrens */ 252789Sahrens 253789Sahrens return (ZPOOL_STATUS_OK); 254789Sahrens } 255789Sahrens 256789Sahrens zpool_status_t 257789Sahrens zpool_get_status(zpool_handle_t *zhp, char **msgid) 258789Sahrens { 259789Sahrens zpool_status_t ret = check_status(zhp->zpool_config, FALSE); 260789Sahrens 261789Sahrens if (ret >= NMSGID) 262789Sahrens *msgid = NULL; 263789Sahrens else 264*1544Seschrock *msgid = msgid_table_active[ret]; 265789Sahrens 266789Sahrens return (ret); 267789Sahrens } 268789Sahrens 269789Sahrens zpool_status_t 270789Sahrens zpool_import_status(nvlist_t *config, char **msgid) 271789Sahrens { 272789Sahrens zpool_status_t ret = check_status(config, TRUE); 273789Sahrens 274789Sahrens if (ret >= NMSGID) 275789Sahrens *msgid = NULL; 276789Sahrens else 277789Sahrens *msgid = msgid_table[ret]; 278789Sahrens 279789Sahrens return (ret); 280789Sahrens } 281