/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Routines to manage the on-disk persistent error log.
 *
 * Each pool stores a log of all logical data errors seen during normal
 * operation.  This is actually the union of two distinct logs: the last log,
 * and the current log.  All errors seen are logged to the current log.  When a
 * scrub completes, the current log becomes the last log, the last log is
 * thrown out, and the current log is reinitialized.
 * This way, if an error is somehow
 * corrected, a new scrub will show that it no longer exists, and it will be
 * deleted from the log when the scrub completes.
 *
 * The log is stored using a ZAP object whose key is a string form of the
 * zbookmark tuple (objset, object, level, blkid), and whose contents is an
 * optional 'objset:object' human-readable string describing the data.  When an
 * error is first logged, this string will be empty, indicating that no name is
 * known.  This prevents us from having to issue a potentially large amount of
 * I/O to discover the object name during an error path.  Instead, we do the
 * calculation when the data is requested, storing the result so future queries
 * will be faster.
 *
 * This log is then shipped into an nvlist where the key is the dataset name and
 * the value is the object name.  Userland is then responsible for uniquifying
 * this list and displaying it to the user.
 */

#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/zio.h>

/*
 * This is a stripped-down version of strtoull, suitable only for converting
 * lowercase hexadecimal numbers that don't overflow.
62*1544Seschrock */ 63*1544Seschrock static uint64_t 64*1544Seschrock strtonum(char *str, char **nptr) 65*1544Seschrock { 66*1544Seschrock uint64_t val = 0; 67*1544Seschrock char c; 68*1544Seschrock int digit; 69*1544Seschrock 70*1544Seschrock while ((c = *str) != '\0') { 71*1544Seschrock if (c >= '0' && c <= '9') 72*1544Seschrock digit = c - '0'; 73*1544Seschrock else if (c >= 'a' && c <= 'f') 74*1544Seschrock digit = 10 + c - 'a'; 75*1544Seschrock else 76*1544Seschrock break; 77*1544Seschrock 78*1544Seschrock val *= 16; 79*1544Seschrock val += digit; 80*1544Seschrock 81*1544Seschrock str++; 82*1544Seschrock } 83*1544Seschrock 84*1544Seschrock *nptr = str; 85*1544Seschrock 86*1544Seschrock return (val); 87*1544Seschrock } 88*1544Seschrock 89*1544Seschrock /* 90*1544Seschrock * Convert a bookmark to a string. 91*1544Seschrock */ 92*1544Seschrock static void 93*1544Seschrock bookmark_to_name(zbookmark_t *zb, char *buf, size_t len) 94*1544Seschrock { 95*1544Seschrock (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", 96*1544Seschrock (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, 97*1544Seschrock (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); 98*1544Seschrock } 99*1544Seschrock 100*1544Seschrock /* 101*1544Seschrock * Convert a string to a bookmark 102*1544Seschrock */ 103*1544Seschrock static void 104*1544Seschrock name_to_bookmark(char *buf, zbookmark_t *zb) 105*1544Seschrock { 106*1544Seschrock zb->zb_objset = strtonum(buf, &buf); 107*1544Seschrock ASSERT(*buf == ':'); 108*1544Seschrock zb->zb_object = strtonum(buf + 1, &buf); 109*1544Seschrock ASSERT(*buf == ':'); 110*1544Seschrock zb->zb_level = (int)strtonum(buf + 1, &buf); 111*1544Seschrock ASSERT(*buf == ':'); 112*1544Seschrock zb->zb_blkid = strtonum(buf + 1, &buf); 113*1544Seschrock ASSERT(*buf == '\0'); 114*1544Seschrock } 115*1544Seschrock 116*1544Seschrock /* 117*1544Seschrock * Log an uncorrectable error to the persistent error log. 
We add it to the 118*1544Seschrock * spa's list of pending errors. The changes are actually synced out to disk 119*1544Seschrock * during spa_errlog_sync(). 120*1544Seschrock */ 121*1544Seschrock void 122*1544Seschrock spa_log_error(spa_t *spa, zio_t *zio) 123*1544Seschrock { 124*1544Seschrock zbookmark_t *zb = &zio->io_logical->io_bookmark; 125*1544Seschrock spa_error_entry_t search; 126*1544Seschrock spa_error_entry_t *new; 127*1544Seschrock avl_tree_t *tree; 128*1544Seschrock avl_index_t where; 129*1544Seschrock 130*1544Seschrock /* 131*1544Seschrock * If we are trying to import a pool, ignore any errors, as we won't be 132*1544Seschrock * writing to the pool any time soon. 133*1544Seschrock */ 134*1544Seschrock if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) 135*1544Seschrock return; 136*1544Seschrock 137*1544Seschrock mutex_enter(&spa->spa_errlist_lock); 138*1544Seschrock 139*1544Seschrock /* 140*1544Seschrock * If we have had a request to rotate the log, log it to the next list 141*1544Seschrock * instead of the current one. 142*1544Seschrock */ 143*1544Seschrock if (spa->spa_scrub_active || spa->spa_scrub_finished) 144*1544Seschrock tree = &spa->spa_errlist_scrub; 145*1544Seschrock else 146*1544Seschrock tree = &spa->spa_errlist_last; 147*1544Seschrock 148*1544Seschrock search.se_bookmark = *zb; 149*1544Seschrock if (avl_find(tree, &search, &where) != NULL) { 150*1544Seschrock mutex_exit(&spa->spa_errlist_lock); 151*1544Seschrock return; 152*1544Seschrock } 153*1544Seschrock 154*1544Seschrock new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); 155*1544Seschrock new->se_bookmark = *zb; 156*1544Seschrock avl_insert(tree, new, where); 157*1544Seschrock 158*1544Seschrock mutex_exit(&spa->spa_errlist_lock); 159*1544Seschrock } 160*1544Seschrock 161*1544Seschrock /* 162*1544Seschrock * Return the number of errors currently in the error log. 
This is actually the 163*1544Seschrock * sum of both the last log and the current log, since we don't know the union 164*1544Seschrock * of these logs until we reach userland. 165*1544Seschrock */ 166*1544Seschrock uint64_t 167*1544Seschrock spa_get_errlog_size(spa_t *spa) 168*1544Seschrock { 169*1544Seschrock uint64_t total = 0, count; 170*1544Seschrock 171*1544Seschrock mutex_enter(&spa->spa_errlog_lock); 172*1544Seschrock if (spa->spa_errlog_scrub != 0 && 173*1544Seschrock zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, 174*1544Seschrock &count) == 0) 175*1544Seschrock total += count; 176*1544Seschrock 177*1544Seschrock if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && 178*1544Seschrock zap_count(spa->spa_meta_objset, spa->spa_errlog_last, 179*1544Seschrock &count) == 0) 180*1544Seschrock total += count; 181*1544Seschrock mutex_exit(&spa->spa_errlog_lock); 182*1544Seschrock 183*1544Seschrock mutex_enter(&spa->spa_errlist_lock); 184*1544Seschrock total += avl_numnodes(&spa->spa_errlist_last); 185*1544Seschrock total += avl_numnodes(&spa->spa_errlist_scrub); 186*1544Seschrock mutex_exit(&spa->spa_errlist_lock); 187*1544Seschrock 188*1544Seschrock return (total); 189*1544Seschrock } 190*1544Seschrock 191*1544Seschrock #ifdef _KERNEL 192*1544Seschrock static int 193*1544Seschrock process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) 194*1544Seschrock { 195*1544Seschrock zap_cursor_t zc; 196*1544Seschrock zap_attribute_t za; 197*1544Seschrock zbookmark_t zb; 198*1544Seschrock 199*1544Seschrock if (obj == 0) 200*1544Seschrock return (0); 201*1544Seschrock 202*1544Seschrock for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); 203*1544Seschrock zap_cursor_retrieve(&zc, &za) == 0; 204*1544Seschrock zap_cursor_advance(&zc)) { 205*1544Seschrock 206*1544Seschrock if (*count == 0) { 207*1544Seschrock zap_cursor_fini(&zc); 208*1544Seschrock return (ENOMEM); 209*1544Seschrock } 210*1544Seschrock 211*1544Seschrock name_to_bookmark(za.za_name, 
&zb); 212*1544Seschrock 213*1544Seschrock if (copyout(&zb, (char *)addr + 214*1544Seschrock (*count - 1) * sizeof (zbookmark_t), 215*1544Seschrock sizeof (zbookmark_t)) != 0) 216*1544Seschrock return (EFAULT); 217*1544Seschrock 218*1544Seschrock *count -= 1; 219*1544Seschrock } 220*1544Seschrock 221*1544Seschrock zap_cursor_fini(&zc); 222*1544Seschrock 223*1544Seschrock return (0); 224*1544Seschrock } 225*1544Seschrock 226*1544Seschrock static int 227*1544Seschrock process_error_list(avl_tree_t *list, void *addr, size_t *count) 228*1544Seschrock { 229*1544Seschrock spa_error_entry_t *se; 230*1544Seschrock 231*1544Seschrock for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { 232*1544Seschrock 233*1544Seschrock if (*count == 0) 234*1544Seschrock return (ENOMEM); 235*1544Seschrock 236*1544Seschrock if (copyout(&se->se_bookmark, (char *)addr + 237*1544Seschrock (*count - 1) * sizeof (zbookmark_t), 238*1544Seschrock sizeof (zbookmark_t)) != 0) 239*1544Seschrock return (EFAULT); 240*1544Seschrock 241*1544Seschrock *count -= 1; 242*1544Seschrock } 243*1544Seschrock 244*1544Seschrock return (0); 245*1544Seschrock } 246*1544Seschrock #endif 247*1544Seschrock 248*1544Seschrock /* 249*1544Seschrock * Copy all known errors to userland as an array of bookmarks. This is 250*1544Seschrock * actually a union of the on-disk last log and current log, as well as any 251*1544Seschrock * pending error requests. 252*1544Seschrock * 253*1544Seschrock * Because the act of reading the on-disk log could cause errors to be 254*1544Seschrock * generated, we have two separate locks: one for the error log and one for the 255*1544Seschrock * in-core error lists. We only need the error list lock to log and error, so 256*1544Seschrock * we grab the error log lock while we read the on-disk logs, and only pick up 257*1544Seschrock * the error list lock when we are finished. 
258*1544Seschrock */ 259*1544Seschrock int 260*1544Seschrock spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) 261*1544Seschrock { 262*1544Seschrock int ret = 0; 263*1544Seschrock 264*1544Seschrock #ifdef _KERNEL 265*1544Seschrock mutex_enter(&spa->spa_errlog_lock); 266*1544Seschrock 267*1544Seschrock ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); 268*1544Seschrock 269*1544Seschrock if (!ret && !spa->spa_scrub_finished) 270*1544Seschrock ret = process_error_log(spa, spa->spa_errlog_last, uaddr, 271*1544Seschrock count); 272*1544Seschrock 273*1544Seschrock mutex_enter(&spa->spa_errlist_lock); 274*1544Seschrock if (!ret) 275*1544Seschrock ret = process_error_list(&spa->spa_errlist_scrub, uaddr, 276*1544Seschrock count); 277*1544Seschrock if (!ret) 278*1544Seschrock ret = process_error_list(&spa->spa_errlist_last, uaddr, 279*1544Seschrock count); 280*1544Seschrock mutex_exit(&spa->spa_errlist_lock); 281*1544Seschrock 282*1544Seschrock mutex_exit(&spa->spa_errlog_lock); 283*1544Seschrock #endif 284*1544Seschrock 285*1544Seschrock return (ret); 286*1544Seschrock } 287*1544Seschrock 288*1544Seschrock /* 289*1544Seschrock * Called when a scrub completes. This simply set a bit which tells which AVL 290*1544Seschrock * tree to add new errors. spa_errlog_sync() is responsible for actually 291*1544Seschrock * syncing the changes to the underlying objects. 292*1544Seschrock */ 293*1544Seschrock void 294*1544Seschrock spa_errlog_rotate(spa_t *spa) 295*1544Seschrock { 296*1544Seschrock mutex_enter(&spa->spa_errlist_lock); 297*1544Seschrock 298*1544Seschrock ASSERT(!spa->spa_scrub_finished); 299*1544Seschrock spa->spa_scrub_finished = B_TRUE; 300*1544Seschrock 301*1544Seschrock mutex_exit(&spa->spa_errlist_lock); 302*1544Seschrock } 303*1544Seschrock 304*1544Seschrock /* 305*1544Seschrock * Discard any pending errors from the spa_t. 
Called when unloading a faulted 306*1544Seschrock * pool, as the errors encountered during the open cannot be synced to disk. 307*1544Seschrock */ 308*1544Seschrock void 309*1544Seschrock spa_errlog_drain(spa_t *spa) 310*1544Seschrock { 311*1544Seschrock spa_error_entry_t *se; 312*1544Seschrock void *cookie; 313*1544Seschrock 314*1544Seschrock mutex_enter(&spa->spa_errlist_lock); 315*1544Seschrock 316*1544Seschrock cookie = NULL; 317*1544Seschrock while ((se = avl_destroy_nodes(&spa->spa_errlist_last, 318*1544Seschrock &cookie)) != NULL) 319*1544Seschrock kmem_free(se, sizeof (spa_error_entry_t)); 320*1544Seschrock cookie = NULL; 321*1544Seschrock while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, 322*1544Seschrock &cookie)) != NULL) 323*1544Seschrock kmem_free(se, sizeof (spa_error_entry_t)); 324*1544Seschrock 325*1544Seschrock mutex_exit(&spa->spa_errlist_lock); 326*1544Seschrock } 327*1544Seschrock 328*1544Seschrock /* 329*1544Seschrock * Process a list of errors into the current on-disk log. 330*1544Seschrock */ 331*1544Seschrock static void 332*1544Seschrock sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) 333*1544Seschrock { 334*1544Seschrock spa_error_entry_t *se; 335*1544Seschrock char buf[64]; 336*1544Seschrock void *cookie; 337*1544Seschrock 338*1544Seschrock if (avl_numnodes(t) != 0) { 339*1544Seschrock /* create log if necessary */ 340*1544Seschrock if (*obj == 0) 341*1544Seschrock *obj = zap_create(spa->spa_meta_objset, 342*1544Seschrock DMU_OT_ERROR_LOG, DMU_OT_NONE, 343*1544Seschrock 0, tx); 344*1544Seschrock 345*1544Seschrock /* add errors to the current log */ 346*1544Seschrock for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { 347*1544Seschrock char *name = se->se_name ? 
se->se_name : ""; 348*1544Seschrock 349*1544Seschrock bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); 350*1544Seschrock 351*1544Seschrock (void) zap_update(spa->spa_meta_objset, 352*1544Seschrock *obj, buf, 1, strlen(name) + 1, name, tx); 353*1544Seschrock } 354*1544Seschrock 355*1544Seschrock /* purge the error list */ 356*1544Seschrock cookie = NULL; 357*1544Seschrock while ((se = avl_destroy_nodes(t, &cookie)) != NULL) 358*1544Seschrock kmem_free(se, sizeof (spa_error_entry_t)); 359*1544Seschrock } 360*1544Seschrock } 361*1544Seschrock 362*1544Seschrock /* 363*1544Seschrock * Sync the error log out to disk. This is a little tricky because the act of 364*1544Seschrock * writing the error log requires the spa_errlist_lock. So, we need to lock the 365*1544Seschrock * error lists, take a copy of the lists, and then reinitialize them. Then, we 366*1544Seschrock * drop the error list lock and take the error log lock, at which point we 367*1544Seschrock * do the errlog processing. Then, if we encounter an I/O error during this 368*1544Seschrock * process, we can successfully add the error to the list. Note that this will 369*1544Seschrock * result in the perpetual recycling of errors, but it is an unlikely situation 370*1544Seschrock * and not a performance critical operation. 371*1544Seschrock */ 372*1544Seschrock void 373*1544Seschrock spa_errlog_sync(spa_t *spa, uint64_t txg) 374*1544Seschrock { 375*1544Seschrock dmu_tx_t *tx; 376*1544Seschrock avl_tree_t scrub, last; 377*1544Seschrock int scrub_finished; 378*1544Seschrock 379*1544Seschrock mutex_enter(&spa->spa_errlist_lock); 380*1544Seschrock 381*1544Seschrock /* 382*1544Seschrock * Bail out early under normal circumstances. 
383*1544Seschrock */ 384*1544Seschrock if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && 385*1544Seschrock avl_numnodes(&spa->spa_errlist_last) == 0 && 386*1544Seschrock !spa->spa_scrub_finished) { 387*1544Seschrock mutex_exit(&spa->spa_errlist_lock); 388*1544Seschrock return; 389*1544Seschrock } 390*1544Seschrock 391*1544Seschrock spa_get_errlists(spa, &last, &scrub); 392*1544Seschrock scrub_finished = spa->spa_scrub_finished; 393*1544Seschrock spa->spa_scrub_finished = B_FALSE; 394*1544Seschrock 395*1544Seschrock mutex_exit(&spa->spa_errlist_lock); 396*1544Seschrock mutex_enter(&spa->spa_errlog_lock); 397*1544Seschrock 398*1544Seschrock tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 399*1544Seschrock 400*1544Seschrock /* 401*1544Seschrock * Sync out the current list of errors. 402*1544Seschrock */ 403*1544Seschrock sync_error_list(spa, &last, &spa->spa_errlog_last, tx); 404*1544Seschrock 405*1544Seschrock /* 406*1544Seschrock * Rotate the log if necessary. 407*1544Seschrock */ 408*1544Seschrock if (scrub_finished) { 409*1544Seschrock if (spa->spa_errlog_last != 0) 410*1544Seschrock VERIFY(dmu_object_free(spa->spa_meta_objset, 411*1544Seschrock spa->spa_errlog_last, tx) == 0); 412*1544Seschrock spa->spa_errlog_last = spa->spa_errlog_scrub; 413*1544Seschrock spa->spa_errlog_scrub = 0; 414*1544Seschrock 415*1544Seschrock sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); 416*1544Seschrock } 417*1544Seschrock 418*1544Seschrock /* 419*1544Seschrock * Sync out any pending scrub errors. 420*1544Seschrock */ 421*1544Seschrock sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); 422*1544Seschrock 423*1544Seschrock /* 424*1544Seschrock * Update the MOS to reflect the new values. 
425*1544Seschrock */ 426*1544Seschrock (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 427*1544Seschrock DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, 428*1544Seschrock &spa->spa_errlog_last, tx); 429*1544Seschrock (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 430*1544Seschrock DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, 431*1544Seschrock &spa->spa_errlog_scrub, tx); 432*1544Seschrock 433*1544Seschrock dmu_tx_commit(tx); 434*1544Seschrock 435*1544Seschrock mutex_exit(&spa->spa_errlog_lock); 436*1544Seschrock } 437