1*0Sstevel@tonic-gate /* 2*0Sstevel@tonic-gate * CDDL HEADER START 3*0Sstevel@tonic-gate * 4*0Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*0Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only 6*0Sstevel@tonic-gate * (the "License"). You may not use this file except in compliance 7*0Sstevel@tonic-gate * with the License. 8*0Sstevel@tonic-gate * 9*0Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*0Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 11*0Sstevel@tonic-gate * See the License for the specific language governing permissions 12*0Sstevel@tonic-gate * and limitations under the License. 13*0Sstevel@tonic-gate * 14*0Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 15*0Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*0Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 17*0Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 18*0Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 19*0Sstevel@tonic-gate * 20*0Sstevel@tonic-gate * CDDL HEADER END 21*0Sstevel@tonic-gate */ 22*0Sstevel@tonic-gate 23*0Sstevel@tonic-gate /* 24*0Sstevel@tonic-gate * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 25*0Sstevel@tonic-gate * Use is subject to license terms. 
26*0Sstevel@tonic-gate */ 27*0Sstevel@tonic-gate 28*0Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 29*0Sstevel@tonic-gate 30*0Sstevel@tonic-gate #include <meta.h> 31*0Sstevel@tonic-gate #include <sdssc.h> 32*0Sstevel@tonic-gate #include <signal.h> 33*0Sstevel@tonic-gate #include <syslog.h> 34*0Sstevel@tonic-gate #include <sys/types.h> 35*0Sstevel@tonic-gate #include <sys/wait.h> 36*0Sstevel@tonic-gate #include <sys/lvm/md_mirror.h> 37*0Sstevel@tonic-gate #include <metad.h> 38*0Sstevel@tonic-gate 39*0Sstevel@tonic-gate #define MY_VERSION "1.0" /* the highest supported version */ 40*0Sstevel@tonic-gate #define MAX_DEBUG_LEVEL 5 /* maximum verbosity level */ 41*0Sstevel@tonic-gate 42*0Sstevel@tonic-gate #define RESET_OWNER 0x0001 43*0Sstevel@tonic-gate #define CHOOSE_OWNER 0x0002 44*0Sstevel@tonic-gate #define RESET_ABR 0x0004 45*0Sstevel@tonic-gate #define UPDATE_ABR 0x0008 46*0Sstevel@tonic-gate #define GET_MIRROR_STATE 0x0010 47*0Sstevel@tonic-gate 48*0Sstevel@tonic-gate #define SET_INFO_NO_WR 0x0002 49*0Sstevel@tonic-gate #define SET_INFO_MN 0x0004 50*0Sstevel@tonic-gate 51*0Sstevel@tonic-gate /* 52*0Sstevel@tonic-gate * This table defines all the metaclust reconfig steps we understand 53*0Sstevel@tonic-gate */ 54*0Sstevel@tonic-gate typedef enum stpnum { 55*0Sstevel@tonic-gate MC_UNK = 0, 56*0Sstevel@tonic-gate MC_START, 57*0Sstevel@tonic-gate MC_STOP, 58*0Sstevel@tonic-gate MC_ABORT, 59*0Sstevel@tonic-gate MC_RETURN, 60*0Sstevel@tonic-gate MC_STEP1, 61*0Sstevel@tonic-gate MC_STEP2, 62*0Sstevel@tonic-gate MC_STEP3, 63*0Sstevel@tonic-gate MC_STEP4 64*0Sstevel@tonic-gate } stepnum_t; 65*0Sstevel@tonic-gate 66*0Sstevel@tonic-gate /* 67*0Sstevel@tonic-gate * Structure for step_name -> step_number mapping 68*0Sstevel@tonic-gate */ 69*0Sstevel@tonic-gate struct step_t { 70*0Sstevel@tonic-gate char *step_nam; 71*0Sstevel@tonic-gate stepnum_t step_num; 72*0Sstevel@tonic-gate }; 73*0Sstevel@tonic-gate 74*0Sstevel@tonic-gate /* 75*0Sstevel@tonic-gate * Step name to 
step number mapping table 76*0Sstevel@tonic-gate * This table MUST be sorted alphabetically in ascending order of step name 77*0Sstevel@tonic-gate */ 78*0Sstevel@tonic-gate static struct step_t step_table[] = { 79*0Sstevel@tonic-gate { "abort", MC_ABORT }, 80*0Sstevel@tonic-gate { "return", MC_RETURN }, 81*0Sstevel@tonic-gate { "start", MC_START }, 82*0Sstevel@tonic-gate { "step1", MC_STEP1 }, 83*0Sstevel@tonic-gate { "step2", MC_STEP2 }, 84*0Sstevel@tonic-gate { "step3", MC_STEP3 }, 85*0Sstevel@tonic-gate { "step4", MC_STEP4 }, 86*0Sstevel@tonic-gate { "stop", MC_STOP } 87*0Sstevel@tonic-gate }; 88*0Sstevel@tonic-gate 89*0Sstevel@tonic-gate /* 90*0Sstevel@tonic-gate * If support for a different version is added, the new version number should 91*0Sstevel@tonic-gate * be appended to the version_table below. This list will be searched to 92*0Sstevel@tonic-gate * determine if a version requested via the -V option is supported or not. 93*0Sstevel@tonic-gate */ 94*0Sstevel@tonic-gate static char *version_table[] = { 95*0Sstevel@tonic-gate MY_VERSION 96*0Sstevel@tonic-gate }; 97*0Sstevel@tonic-gate 98*0Sstevel@tonic-gate uint_t timeout = 0; /* disable timeout by default */ 99*0Sstevel@tonic-gate char *version = MY_VERSION; /* use latest version by default */ 100*0Sstevel@tonic-gate int stepnum = MC_UNK; /* reconfiguration step number */ 101*0Sstevel@tonic-gate pid_t c_pid; /* child process id */ 102*0Sstevel@tonic-gate 103*0Sstevel@tonic-gate /* 104*0Sstevel@tonic-gate * Binary search comparison routine 105*0Sstevel@tonic-gate */ 106*0Sstevel@tonic-gate static int 107*0Sstevel@tonic-gate mc_compare(const void *stp1, const void *stp2) 108*0Sstevel@tonic-gate { 109*0Sstevel@tonic-gate return (strcmp((const char *)stp1, 110*0Sstevel@tonic-gate ((const struct step_t *)stp2)->step_nam)); 111*0Sstevel@tonic-gate } 112*0Sstevel@tonic-gate 113*0Sstevel@tonic-gate /* 114*0Sstevel@tonic-gate * Timeout expiry alarm signal handler 115*0Sstevel@tonic-gate */ 116*0Sstevel@tonic-gate 
/*ARGSUSED*/ 117*0Sstevel@tonic-gate static void 118*0Sstevel@tonic-gate sigalarmhandler(int sig) 119*0Sstevel@tonic-gate { 120*0Sstevel@tonic-gate int i, n, ret, stat_loc = 0; 121*0Sstevel@tonic-gate 122*0Sstevel@tonic-gate n = sizeof (step_table) / sizeof (step_table[0]); 123*0Sstevel@tonic-gate for (i = 0; i < n; i++) { 124*0Sstevel@tonic-gate if (stepnum == step_table[i].step_num) 125*0Sstevel@tonic-gate break; 126*0Sstevel@tonic-gate } 127*0Sstevel@tonic-gate 128*0Sstevel@tonic-gate assert(i != n); 129*0Sstevel@tonic-gate 130*0Sstevel@tonic-gate meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"), 131*0Sstevel@tonic-gate step_table[i].step_nam, 132*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 133*0Sstevel@tonic-gate 134*0Sstevel@tonic-gate if ((ret = kill(c_pid, SIGKILL)) == 0) { 135*0Sstevel@tonic-gate /* 136*0Sstevel@tonic-gate * The child will wait forever until the status is retrieved 137*0Sstevel@tonic-gate * so get it now. Keep retrying if the call is interrupted. 138*0Sstevel@tonic-gate * 139*0Sstevel@tonic-gate * The possible results are, 140*0Sstevel@tonic-gate * 141*0Sstevel@tonic-gate * - child killed successfully 142*0Sstevel@tonic-gate * - signal sent but child not killed 143*0Sstevel@tonic-gate * - waitpid failed/interrupted 144*0Sstevel@tonic-gate */ 145*0Sstevel@tonic-gate sleep(2); 146*0Sstevel@tonic-gate while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) { 147*0Sstevel@tonic-gate if (errno != EINTR) { 148*0Sstevel@tonic-gate break; 149*0Sstevel@tonic-gate } 150*0Sstevel@tonic-gate } 151*0Sstevel@tonic-gate if ((ret == c_pid) || (errno == ECHILD)) { 152*0Sstevel@tonic-gate ret = 0; 153*0Sstevel@tonic-gate } else { 154*0Sstevel@tonic-gate ret = 1; 155*0Sstevel@tonic-gate } 156*0Sstevel@tonic-gate } else if (errno == ESRCH) { 157*0Sstevel@tonic-gate /* 158*0Sstevel@tonic-gate * If the kill did not catch the child then it means the child 159*0Sstevel@tonic-gate * exited immediately after the timeout occured. 
160*0Sstevel@tonic-gate */ 161*0Sstevel@tonic-gate ret = 0; 162*0Sstevel@tonic-gate } 163*0Sstevel@tonic-gate 164*0Sstevel@tonic-gate /* 165*0Sstevel@tonic-gate * make sure not to exit with 205 for any steps other than step1-step4. 166*0Sstevel@tonic-gate * Suncluster reconfiguration can't handle it otherwise. 167*0Sstevel@tonic-gate */ 168*0Sstevel@tonic-gate switch (stepnum) { 169*0Sstevel@tonic-gate case MC_STEP1: 170*0Sstevel@tonic-gate case MC_STEP2: 171*0Sstevel@tonic-gate case MC_STEP3: 172*0Sstevel@tonic-gate case MC_STEP4: 173*0Sstevel@tonic-gate /* 174*0Sstevel@tonic-gate * If the child was killed successfully return 205 for a 175*0Sstevel@tonic-gate * new reconfig cycle otherwise send 1 to panic the node. 176*0Sstevel@tonic-gate */ 177*0Sstevel@tonic-gate if (ret != 0) { 178*0Sstevel@tonic-gate md_eprintf(gettext("Could not kill child\n")); 179*0Sstevel@tonic-gate exit(1); 180*0Sstevel@tonic-gate } else { 181*0Sstevel@tonic-gate exit(205); 182*0Sstevel@tonic-gate } 183*0Sstevel@tonic-gate break; 184*0Sstevel@tonic-gate case MC_START: 185*0Sstevel@tonic-gate case MC_STOP: 186*0Sstevel@tonic-gate case MC_ABORT: 187*0Sstevel@tonic-gate case MC_RETURN: 188*0Sstevel@tonic-gate default: 189*0Sstevel@tonic-gate exit(1); 190*0Sstevel@tonic-gate break; 191*0Sstevel@tonic-gate } 192*0Sstevel@tonic-gate } 193*0Sstevel@tonic-gate 194*0Sstevel@tonic-gate /* 195*0Sstevel@tonic-gate * Attempt to load local set. 196*0Sstevel@tonic-gate * Returns: 197*0Sstevel@tonic-gate * pointer to mdsetname_t for local set (local_sp) is successful. 198*0Sstevel@tonic-gate * 0 if failure 199*0Sstevel@tonic-gate * if there are no local set mddbs, no error message is printed. 200*0Sstevel@tonic-gate * Otherwise, error message is printed so that user 201*0Sstevel@tonic-gate * can determine why the local set didn't start. 
202*0Sstevel@tonic-gate */ 203*0Sstevel@tonic-gate mdsetname_t * 204*0Sstevel@tonic-gate load_local_set(md_error_t *ep) 205*0Sstevel@tonic-gate { 206*0Sstevel@tonic-gate mdsetname_t *local_sp = NULL; 207*0Sstevel@tonic-gate 208*0Sstevel@tonic-gate /* Does local set exist? If not, give no error */ 209*0Sstevel@tonic-gate if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) { 210*0Sstevel@tonic-gate return (0); 211*0Sstevel@tonic-gate } 212*0Sstevel@tonic-gate 213*0Sstevel@tonic-gate /* 214*0Sstevel@tonic-gate * snarf local set 215*0Sstevel@tonic-gate * If fails with MDE_DB_NODB, then just return 1 printing 216*0Sstevel@tonic-gate * no failure. 217*0Sstevel@tonic-gate * Otherwise, print error message, and return 1. 218*0Sstevel@tonic-gate */ 219*0Sstevel@tonic-gate if (meta_setup_db_locations(ep) != 0) { 220*0Sstevel@tonic-gate if (!(mdismddberror(ep, MDE_DB_NODB))) 221*0Sstevel@tonic-gate mde_perror(ep, ""); 222*0Sstevel@tonic-gate return (0); 223*0Sstevel@tonic-gate } 224*0Sstevel@tonic-gate 225*0Sstevel@tonic-gate /* local set loaded successfully */ 226*0Sstevel@tonic-gate return (local_sp); 227*0Sstevel@tonic-gate } 228*0Sstevel@tonic-gate 229*0Sstevel@tonic-gate /* 230*0Sstevel@tonic-gate * Purpose: Compose a full path name for a metadevice 231*0Sstevel@tonic-gate * 232*0Sstevel@tonic-gate * On entry: sp - setname pointer 233*0Sstevel@tonic-gate * mnum - minor number of metadevice 234*0Sstevel@tonic-gate * pathname - pointer to array to return path string 235*0Sstevel@tonic-gate * pathlen - max length of pathname array 236*0Sstevel@tonic-gate */ 237*0Sstevel@tonic-gate static int 238*0Sstevel@tonic-gate compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen) 239*0Sstevel@tonic-gate { 240*0Sstevel@tonic-gate int rtn; 241*0Sstevel@tonic-gate 242*0Sstevel@tonic-gate if (MD_MIN2SET(mnum) != sp->setno) { 243*0Sstevel@tonic-gate md_eprintf(gettext("minor number 0x%x invalid for set %d\n"), 244*0Sstevel@tonic-gate mnum, sp->setno); 
245*0Sstevel@tonic-gate return (-1); 246*0Sstevel@tonic-gate } 247*0Sstevel@tonic-gate rtn = snprintf(pathname, pathlen, "/dev/md/%s/rdsk/d%u", 248*0Sstevel@tonic-gate sp->setname, (unsigned)MD_MIN2UNIT(mnum)); 249*0Sstevel@tonic-gate 250*0Sstevel@tonic-gate if ((pathname[0] == '\0') || (rtn >= pathlen)) { 251*0Sstevel@tonic-gate md_eprintf(gettext( 252*0Sstevel@tonic-gate "Could not create path for device %s/d%u\n"), 253*0Sstevel@tonic-gate sp->setname, (unsigned)MD_MIN2UNIT(mnum)); 254*0Sstevel@tonic-gate return (-1); 255*0Sstevel@tonic-gate } 256*0Sstevel@tonic-gate return (0); 257*0Sstevel@tonic-gate } 258*0Sstevel@tonic-gate 259*0Sstevel@tonic-gate /* 260*0Sstevel@tonic-gate * Purpose: Walk through all the devices specified for the given set 261*0Sstevel@tonic-gate * and do the action specified in mode 262*0Sstevel@tonic-gate */ 263*0Sstevel@tonic-gate static int 264*0Sstevel@tonic-gate reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep) 265*0Sstevel@tonic-gate { 266*0Sstevel@tonic-gate mdnamelist_t *devnlp = NULL; 267*0Sstevel@tonic-gate mdnamelist_t *p; 268*0Sstevel@tonic-gate mdname_t *devnp = NULL; 269*0Sstevel@tonic-gate md_set_mmown_params_t ownpar_p; 270*0Sstevel@tonic-gate md_set_mmown_params_t *ownpar = &ownpar_p; 271*0Sstevel@tonic-gate md_unit_t *mm; 272*0Sstevel@tonic-gate int mirror_dev = 0; 273*0Sstevel@tonic-gate mndiskset_membershiplist_t *nl; 274*0Sstevel@tonic-gate int cnt; 275*0Sstevel@tonic-gate int has_parent; 276*0Sstevel@tonic-gate md_mn_get_mir_state_t mir_state_p; 277*0Sstevel@tonic-gate md_mn_get_mir_state_t *mir_state = &mir_state_p; 278*0Sstevel@tonic-gate 279*0Sstevel@tonic-gate /* 280*0Sstevel@tonic-gate * if we are choosing or resetting the owners then make sure 281*0Sstevel@tonic-gate * we are only doing it for mirror devices 282*0Sstevel@tonic-gate */ 283*0Sstevel@tonic-gate mirror_dev = (strcmp(MD_MIRROR, drivername) == 0); 284*0Sstevel@tonic-gate if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && 
!mirror_dev) { 285*0Sstevel@tonic-gate return (-1); 286*0Sstevel@tonic-gate } 287*0Sstevel@tonic-gate 288*0Sstevel@tonic-gate /* get a list of all the metadevices for current set */ 289*0Sstevel@tonic-gate if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) { 290*0Sstevel@tonic-gate mde_perror(ep, gettext("Could not get mirrors for set %s"), 291*0Sstevel@tonic-gate sp->setname); 292*0Sstevel@tonic-gate return (-1); 293*0Sstevel@tonic-gate } else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) { 294*0Sstevel@tonic-gate mde_perror(ep, gettext( 295*0Sstevel@tonic-gate "Could not get soft partitions for set %s"), sp->setname); 296*0Sstevel@tonic-gate return (-1); 297*0Sstevel@tonic-gate } 298*0Sstevel@tonic-gate 299*0Sstevel@tonic-gate /* If resetting the owner, get the known membership list */ 300*0Sstevel@tonic-gate if (mode & RESET_OWNER) { 301*0Sstevel@tonic-gate if (meta_read_nodelist(&cnt, &nl, ep)) { 302*0Sstevel@tonic-gate mde_perror(ep, "Could not get nodelist"); 303*0Sstevel@tonic-gate return (-1); 304*0Sstevel@tonic-gate } 305*0Sstevel@tonic-gate } 306*0Sstevel@tonic-gate 307*0Sstevel@tonic-gate /* for each metadevice */ 308*0Sstevel@tonic-gate for (p = devnlp; (p != NULL); p = p->next) { 309*0Sstevel@tonic-gate devnp = p->namep; 310*0Sstevel@tonic-gate 311*0Sstevel@tonic-gate /* 312*0Sstevel@tonic-gate * Get the current setting for mirror ABR state and all of the 313*0Sstevel@tonic-gate * submirror state and flags from the master node. We only 314*0Sstevel@tonic-gate * perform this when going through a 'start' cycle. 
315*0Sstevel@tonic-gate */ 316*0Sstevel@tonic-gate if ((mode & GET_MIRROR_STATE) && mirror_dev) { 317*0Sstevel@tonic-gate char *miscname; 318*0Sstevel@tonic-gate 319*0Sstevel@tonic-gate /* 320*0Sstevel@tonic-gate * Ensure that we ignore soft-parts that are returned 321*0Sstevel@tonic-gate * from the meta_get_mirror_names() call 322*0Sstevel@tonic-gate */ 323*0Sstevel@tonic-gate if ((miscname = metagetmiscname(devnp, ep)) == NULL) 324*0Sstevel@tonic-gate goto out; 325*0Sstevel@tonic-gate if (strcmp(miscname, MD_MIRROR) != 0) 326*0Sstevel@tonic-gate continue; 327*0Sstevel@tonic-gate 328*0Sstevel@tonic-gate mir_state->mnum = meta_getminor(devnp->dev); 329*0Sstevel@tonic-gate MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno); 330*0Sstevel@tonic-gate meta_mc_log(MC_LOG4, gettext("Getting mirror state" 331*0Sstevel@tonic-gate " for %s/d%u: %s"), sp->setname, 332*0Sstevel@tonic-gate (unsigned)MD_MIN2UNIT(mir_state->mnum), 333*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 334*0Sstevel@tonic-gate 335*0Sstevel@tonic-gate if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep, 336*0Sstevel@tonic-gate "MD_MN_GET_MIRROR_STATE") != 0) { 337*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to get " 338*0Sstevel@tonic-gate "mirror state for %s/d%u"), sp->setname, 339*0Sstevel@tonic-gate (unsigned)MD_MIN2UNIT(mir_state->mnum)); 340*0Sstevel@tonic-gate goto out; 341*0Sstevel@tonic-gate } else { 342*0Sstevel@tonic-gate continue; 343*0Sstevel@tonic-gate } 344*0Sstevel@tonic-gate } 345*0Sstevel@tonic-gate 346*0Sstevel@tonic-gate /* check if this is a top level metadevice */ 347*0Sstevel@tonic-gate if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL) 348*0Sstevel@tonic-gate goto out; 349*0Sstevel@tonic-gate if (MD_HAS_PARENT(MD_PARENT(mm))) { 350*0Sstevel@tonic-gate has_parent = 1; 351*0Sstevel@tonic-gate } else { 352*0Sstevel@tonic-gate has_parent = 0; 353*0Sstevel@tonic-gate } 354*0Sstevel@tonic-gate Free(mm); 355*0Sstevel@tonic-gate 356*0Sstevel@tonic-gate if (mode & 
(RESET_OWNER | CHOOSE_OWNER)) { 357*0Sstevel@tonic-gate char *miscname; 358*0Sstevel@tonic-gate 359*0Sstevel@tonic-gate /* 360*0Sstevel@tonic-gate * we can only do these for mirrors so make sure we 361*0Sstevel@tonic-gate * really have a mirror device and not a softpartition 362*0Sstevel@tonic-gate * imitating one. meta_get_mirror_names seems to think 363*0Sstevel@tonic-gate * softparts on top of a mirror are mirrors! 364*0Sstevel@tonic-gate */ 365*0Sstevel@tonic-gate if ((miscname = metagetmiscname(devnp, ep)) == NULL) 366*0Sstevel@tonic-gate goto out; 367*0Sstevel@tonic-gate if (strcmp(miscname, MD_MIRROR) != 0) 368*0Sstevel@tonic-gate continue; 369*0Sstevel@tonic-gate 370*0Sstevel@tonic-gate (void) memset(ownpar, 0, sizeof (*ownpar)); 371*0Sstevel@tonic-gate ownpar->d.mnum = meta_getminor(devnp->dev); 372*0Sstevel@tonic-gate MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno); 373*0Sstevel@tonic-gate 374*0Sstevel@tonic-gate meta_mc_log(MC_LOG4, gettext("Setting owner " 375*0Sstevel@tonic-gate "for %s/d%u: %s"), sp->setname, 376*0Sstevel@tonic-gate (unsigned)MD_MIN2UNIT(ownpar->d.mnum), 377*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 378*0Sstevel@tonic-gate 379*0Sstevel@tonic-gate /* get the current owner id */ 380*0Sstevel@tonic-gate if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep, 381*0Sstevel@tonic-gate "MD_MN_GET_MM_OWNER") != 0) { 382*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to get " 383*0Sstevel@tonic-gate "mirror owner for %s/d%u"), sp->setname, 384*0Sstevel@tonic-gate (unsigned)MD_MIN2UNIT(ownpar->d.mnum)); 385*0Sstevel@tonic-gate goto out; 386*0Sstevel@tonic-gate } 387*0Sstevel@tonic-gate } 388*0Sstevel@tonic-gate 389*0Sstevel@tonic-gate if (mode & RESET_OWNER) { 390*0Sstevel@tonic-gate if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) { 391*0Sstevel@tonic-gate mdclrerror(ep); 392*0Sstevel@tonic-gate continue; 393*0Sstevel@tonic-gate } 394*0Sstevel@tonic-gate 395*0Sstevel@tonic-gate /* 396*0Sstevel@tonic-gate * reset owner only if the 
current owner is 397*0Sstevel@tonic-gate * not in the membership list 398*0Sstevel@tonic-gate * Also kill the resync thread so that when the resync 399*0Sstevel@tonic-gate * is started, it will perform an optimized resync 400*0Sstevel@tonic-gate * for any resync regions that were dirty when the 401*0Sstevel@tonic-gate * current owner left the membership. 402*0Sstevel@tonic-gate */ 403*0Sstevel@tonic-gate if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) { 404*0Sstevel@tonic-gate if (meta_mn_change_owner(&ownpar, 405*0Sstevel@tonic-gate sp->setno, ownpar->d.mnum, 406*0Sstevel@tonic-gate MD_MN_MIRROR_UNOWNED, 407*0Sstevel@tonic-gate MD_MN_MM_ALLOW_CHANGE) == -1) { 408*0Sstevel@tonic-gate md_eprintf(gettext( 409*0Sstevel@tonic-gate "Unable to reset mirror owner " 410*0Sstevel@tonic-gate "for %s/d%u\n"), sp->setname, 411*0Sstevel@tonic-gate (unsigned)MD_MIN2UNIT( 412*0Sstevel@tonic-gate ownpar->d.mnum)); 413*0Sstevel@tonic-gate goto out; 414*0Sstevel@tonic-gate } 415*0Sstevel@tonic-gate if (meta_mirror_resync(sp, devnp, 0, ep, 416*0Sstevel@tonic-gate MD_RESYNC_KILL_NO_WAIT) != 0) { 417*0Sstevel@tonic-gate md_eprintf(gettext( 418*0Sstevel@tonic-gate "Unable to kill resync for" 419*0Sstevel@tonic-gate " %s/d%u\n"), sp->setname, 420*0Sstevel@tonic-gate (unsigned)MD_MIN2UNIT( 421*0Sstevel@tonic-gate ownpar->d.mnum)); 422*0Sstevel@tonic-gate goto out; 423*0Sstevel@tonic-gate } 424*0Sstevel@tonic-gate } 425*0Sstevel@tonic-gate } 426*0Sstevel@tonic-gate 427*0Sstevel@tonic-gate if (mode & CHOOSE_OWNER) { 428*0Sstevel@tonic-gate /* 429*0Sstevel@tonic-gate * only orphaned resyncs will have no owner. 430*0Sstevel@tonic-gate * if that is the case choose a new owner. Otherwise 431*0Sstevel@tonic-gate * re-establish the existing owner. This covers the 432*0Sstevel@tonic-gate * case where a node that owned the mirror 433*0Sstevel@tonic-gate * reboots/panics and comes back into the cluster before 434*0Sstevel@tonic-gate * the reconfig cycle has completed. 
In this case the 435*0Sstevel@tonic-gate * other cluster nodes will have the mirror owner marked 436*0Sstevel@tonic-gate * as the rebooted node while it has the owner marked 437*0Sstevel@tonic-gate * as 'None'. We have to reestablish the ownership so 438*0Sstevel@tonic-gate * that the subsequent resync can continue. 439*0Sstevel@tonic-gate */ 440*0Sstevel@tonic-gate if (meta_mn_change_owner(&ownpar, sp->setno, 441*0Sstevel@tonic-gate ownpar->d.mnum, ownpar->d.owner, 442*0Sstevel@tonic-gate MD_MN_MM_CHOOSE_OWNER) == -1) { 443*0Sstevel@tonic-gate md_eprintf(gettext("Unable to choose " 444*0Sstevel@tonic-gate "mirror owner for %s/d%u\n"), sp->setname, 445*0Sstevel@tonic-gate (unsigned)MD_MIN2UNIT(ownpar->d.mnum)); 446*0Sstevel@tonic-gate goto out; 447*0Sstevel@tonic-gate } 448*0Sstevel@tonic-gate } 449*0Sstevel@tonic-gate 450*0Sstevel@tonic-gate /* 451*0Sstevel@tonic-gate * For RESET_ABR and UPDATE_ABR - only handle top 452*0Sstevel@tonic-gate * level metadevices. 453*0Sstevel@tonic-gate */ 454*0Sstevel@tonic-gate if (has_parent) 455*0Sstevel@tonic-gate continue; 456*0Sstevel@tonic-gate 457*0Sstevel@tonic-gate if (mode & RESET_ABR) { 458*0Sstevel@tonic-gate /* 459*0Sstevel@tonic-gate * Reset the ABR (application based recovery) 460*0Sstevel@tonic-gate * value on all nodes. We are dealing with 461*0Sstevel@tonic-gate * the possibility that we have ABR set but the 462*0Sstevel@tonic-gate * only node that had the device open with ABR has 463*0Sstevel@tonic-gate * left the cluster. We simply open and close the 464*0Sstevel@tonic-gate * device and if this is the last close in the 465*0Sstevel@tonic-gate * cluster, ABR will be cleared on all nodes. 
466*0Sstevel@tonic-gate */ 467*0Sstevel@tonic-gate char *miscname; 468*0Sstevel@tonic-gate char name[MD_MAX_CTDLEN]; 469*0Sstevel@tonic-gate int mnum, fd; 470*0Sstevel@tonic-gate 471*0Sstevel@tonic-gate name[0] = '\0'; 472*0Sstevel@tonic-gate mnum = meta_getminor(devnp->dev); 473*0Sstevel@tonic-gate 474*0Sstevel@tonic-gate /* 475*0Sstevel@tonic-gate * Ensure that we don't include soft-parts in the 476*0Sstevel@tonic-gate * mirror-only call to RESET_ABR. meta_get_mirror_names 477*0Sstevel@tonic-gate * returns a bogus list that includes all soft-parts 478*0Sstevel@tonic-gate * built on mirrors. 479*0Sstevel@tonic-gate */ 480*0Sstevel@tonic-gate if ((miscname = metagetmiscname(devnp, ep)) == NULL) 481*0Sstevel@tonic-gate goto out; 482*0Sstevel@tonic-gate if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 483*0Sstevel@tonic-gate continue; 484*0Sstevel@tonic-gate 485*0Sstevel@tonic-gate meta_mc_log(MC_LOG4, gettext("Re-setting ABR state " 486*0Sstevel@tonic-gate "for %s/d%u: %s"), sp->setname, 487*0Sstevel@tonic-gate (unsigned)MD_MIN2UNIT(mnum), 488*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 489*0Sstevel@tonic-gate 490*0Sstevel@tonic-gate /* compose the absolute device path and open it */ 491*0Sstevel@tonic-gate if (compose_path(sp, mnum, &name[0], 492*0Sstevel@tonic-gate sizeof (name)) != 0) 493*0Sstevel@tonic-gate goto out; 494*0Sstevel@tonic-gate if ((fd = open(name, O_RDWR, 0)) < 0) { 495*0Sstevel@tonic-gate md_perror(gettext("Could not open device %s"), 496*0Sstevel@tonic-gate name); 497*0Sstevel@tonic-gate continue; 498*0Sstevel@tonic-gate } 499*0Sstevel@tonic-gate 500*0Sstevel@tonic-gate (void) close(fd); 501*0Sstevel@tonic-gate } 502*0Sstevel@tonic-gate 503*0Sstevel@tonic-gate if (mode & UPDATE_ABR) { 504*0Sstevel@tonic-gate /* 505*0Sstevel@tonic-gate * Update the ABR value on this node. We obtain the 506*0Sstevel@tonic-gate * current ABR state from the master node. 
507*0Sstevel@tonic-gate */ 508*0Sstevel@tonic-gate 509*0Sstevel@tonic-gate char *miscname; 510*0Sstevel@tonic-gate char name[MD_MAX_CTDLEN]; 511*0Sstevel@tonic-gate int mnum, fd; 512*0Sstevel@tonic-gate volcap_t vc; 513*0Sstevel@tonic-gate uint_t tstate; 514*0Sstevel@tonic-gate 515*0Sstevel@tonic-gate name[0] = '\0'; 516*0Sstevel@tonic-gate mnum = meta_getminor(devnp->dev); 517*0Sstevel@tonic-gate 518*0Sstevel@tonic-gate /* 519*0Sstevel@tonic-gate * Ensure that we don't include soft-parts in the 520*0Sstevel@tonic-gate * mirror-only call to UPDATE_ABR. meta_get_mirror_names 521*0Sstevel@tonic-gate * returns a bogus list that includes all soft-parts 522*0Sstevel@tonic-gate * built on mirrors. 523*0Sstevel@tonic-gate */ 524*0Sstevel@tonic-gate if ((miscname = metagetmiscname(devnp, ep)) == NULL) 525*0Sstevel@tonic-gate goto out; 526*0Sstevel@tonic-gate if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 527*0Sstevel@tonic-gate continue; 528*0Sstevel@tonic-gate 529*0Sstevel@tonic-gate /* Get tstate from Master */ 530*0Sstevel@tonic-gate if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) 531*0Sstevel@tonic-gate != 0) 532*0Sstevel@tonic-gate continue; 533*0Sstevel@tonic-gate /* If not set on the master, nothing to do */ 534*0Sstevel@tonic-gate if (!(tstate & MD_ABR_CAP)) 535*0Sstevel@tonic-gate continue; 536*0Sstevel@tonic-gate 537*0Sstevel@tonic-gate meta_mc_log(MC_LOG4, gettext("Updating ABR state " 538*0Sstevel@tonic-gate "for %s/d%u: %s"), sp->setname, 539*0Sstevel@tonic-gate (unsigned)MD_MIN2UNIT(mnum), 540*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 541*0Sstevel@tonic-gate 542*0Sstevel@tonic-gate /* compose the absolute device path and open it */ 543*0Sstevel@tonic-gate if (compose_path(sp, mnum, &name[0], 544*0Sstevel@tonic-gate sizeof (name)) != 0) 545*0Sstevel@tonic-gate goto out; 546*0Sstevel@tonic-gate if ((fd = open(name, O_RDWR, 0)) < 0) { 547*0Sstevel@tonic-gate md_perror(gettext("Could not open device %s"), 
548*0Sstevel@tonic-gate name); 549*0Sstevel@tonic-gate continue; 550*0Sstevel@tonic-gate } 551*0Sstevel@tonic-gate 552*0Sstevel@tonic-gate /* set ABR state */ 553*0Sstevel@tonic-gate vc.vc_info = 0; 554*0Sstevel@tonic-gate vc.vc_set = 0; 555*0Sstevel@tonic-gate if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) { 556*0Sstevel@tonic-gate /* 557*0Sstevel@tonic-gate * Ignore if device does not support this 558*0Sstevel@tonic-gate * ioctl 559*0Sstevel@tonic-gate */ 560*0Sstevel@tonic-gate if ((errno != ENOTTY) && (errno != ENOTSUP)) { 561*0Sstevel@tonic-gate md_perror(gettext("Could not get " 562*0Sstevel@tonic-gate "ABR/DMR state for device %s"), 563*0Sstevel@tonic-gate name); 564*0Sstevel@tonic-gate } 565*0Sstevel@tonic-gate (void) close(fd); 566*0Sstevel@tonic-gate continue; 567*0Sstevel@tonic-gate } 568*0Sstevel@tonic-gate if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) { 569*0Sstevel@tonic-gate (void) close(fd); 570*0Sstevel@tonic-gate continue; 571*0Sstevel@tonic-gate } 572*0Sstevel@tonic-gate 573*0Sstevel@tonic-gate vc.vc_set = DKV_ABR_CAP; 574*0Sstevel@tonic-gate if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) { 575*0Sstevel@tonic-gate md_perror(gettext( 576*0Sstevel@tonic-gate "Could not set ABR state for " 577*0Sstevel@tonic-gate "device %s"), name); 578*0Sstevel@tonic-gate (void) close(fd); 579*0Sstevel@tonic-gate goto out; 580*0Sstevel@tonic-gate } else { 581*0Sstevel@tonic-gate md_eprintf(gettext( 582*0Sstevel@tonic-gate "Setting ABR state on device %s\n"), name); 583*0Sstevel@tonic-gate } 584*0Sstevel@tonic-gate 585*0Sstevel@tonic-gate (void) close(fd); 586*0Sstevel@tonic-gate } 587*0Sstevel@tonic-gate } 588*0Sstevel@tonic-gate 589*0Sstevel@tonic-gate /* cleanup */ 590*0Sstevel@tonic-gate if (mode & RESET_OWNER) { 591*0Sstevel@tonic-gate meta_free_nodelist(nl); 592*0Sstevel@tonic-gate } 593*0Sstevel@tonic-gate metafreenamelist(devnlp); 594*0Sstevel@tonic-gate return (0); 595*0Sstevel@tonic-gate 596*0Sstevel@tonic-gate out: 597*0Sstevel@tonic-gate /* cleanup */ 
598*0Sstevel@tonic-gate if (mode & RESET_OWNER) { 599*0Sstevel@tonic-gate meta_free_nodelist(nl); 600*0Sstevel@tonic-gate } 601*0Sstevel@tonic-gate metafreenamelist(devnlp); 602*0Sstevel@tonic-gate return (-1); 603*0Sstevel@tonic-gate } 604*0Sstevel@tonic-gate 605*0Sstevel@tonic-gate /* 606*0Sstevel@tonic-gate * Print usage message 607*0Sstevel@tonic-gate */ 608*0Sstevel@tonic-gate static void 609*0Sstevel@tonic-gate usage(mdsetname_t *sp, int eval) 610*0Sstevel@tonic-gate { 611*0Sstevel@tonic-gate (void) fprintf(stderr, gettext("usage:" 612*0Sstevel@tonic-gate "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n" 613*0Sstevel@tonic-gate "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n" 614*0Sstevel@tonic-gate "\t%s [-V version] [-t timeout] [-d level] abort | stop\n" 615*0Sstevel@tonic-gate "\t%s [-V | -? | -h]\n"), 616*0Sstevel@tonic-gate myname, myname, myname, myname); 617*0Sstevel@tonic-gate if (!eval) { 618*0Sstevel@tonic-gate fprintf(stderr, gettext("\n" 619*0Sstevel@tonic-gate "\tValid debug (-d) levels are 1-%d for increasing " 620*0Sstevel@tonic-gate "verbosity.\n\tDefault is -d 3.\n\n" 621*0Sstevel@tonic-gate "\tValid step values are: return | step1 | step2 | " 622*0Sstevel@tonic-gate "step3 | step4\n\n" 623*0Sstevel@tonic-gate "\tNodelist is a space-separated list of node id's\n\n"), 624*0Sstevel@tonic-gate MAX_DEBUG_LEVEL); 625*0Sstevel@tonic-gate } 626*0Sstevel@tonic-gate md_exit(sp, eval); 627*0Sstevel@tonic-gate } 628*0Sstevel@tonic-gate 629*0Sstevel@tonic-gate /* 630*0Sstevel@tonic-gate * Input: Input takes a config step name followed by a list of 631*0Sstevel@tonic-gate * possible node id's. 632*0Sstevel@tonic-gate * 633*0Sstevel@tonic-gate * Returns: 0 - Success 634*0Sstevel@tonic-gate * 1 - Fail 635*0Sstevel@tonic-gate * Node will be removed from cluster membership 636*0Sstevel@tonic-gate * by forcing node to panic. 637*0Sstevel@tonic-gate * 205 - Unsuccessful. Start another reconfig cycle. 
638*0Sstevel@tonic-gate * Problem was encountered that could be fixed by 639*0Sstevel@tonic-gate * running another reconfig cycle. 640*0Sstevel@tonic-gate * Problem could be a result of a failure to read 641*0Sstevel@tonic-gate * the nodelist file or that all work could not be 642*0Sstevel@tonic-gate * accomplished in a reconfig step in the amount of 643*0Sstevel@tonic-gate * time given so another reconfig cycle is needed in 644*0Sstevel@tonic-gate * order to finish the current step. 645*0Sstevel@tonic-gate */ 646*0Sstevel@tonic-gate int 647*0Sstevel@tonic-gate main(int argc, char **argv) 648*0Sstevel@tonic-gate { 649*0Sstevel@tonic-gate mdsetname_t *sp = NULL; 650*0Sstevel@tonic-gate md_error_t status = mdnullerror; 651*0Sstevel@tonic-gate md_error_t *ep = &status; 652*0Sstevel@tonic-gate set_t max_sets, setno; 653*0Sstevel@tonic-gate int c, clust = 0; 654*0Sstevel@tonic-gate struct sigaction nsa, osa; 655*0Sstevel@tonic-gate struct step_t *step_ptr; 656*0Sstevel@tonic-gate mdsetname_t *local_sp = NULL; 657*0Sstevel@tonic-gate md_drive_desc *dd; 658*0Sstevel@tonic-gate int rval = 0; 659*0Sstevel@tonic-gate md_set_desc *sd; 660*0Sstevel@tonic-gate mddb_block_parm_t mbp; 661*0Sstevel@tonic-gate uint_t debug = 3; /* log upto MC_LOG3 by default */ 662*0Sstevel@tonic-gate int version_table_size; 663*0Sstevel@tonic-gate mddb_setflags_config_t sf; 664*0Sstevel@tonic-gate int ret_val; 665*0Sstevel@tonic-gate mddb_config_t cfg; 666*0Sstevel@tonic-gate int set_info[MD_MAXSETS]; 667*0Sstevel@tonic-gate 668*0Sstevel@tonic-gate /* 669*0Sstevel@tonic-gate * Get the locale set up before calling any other routines 670*0Sstevel@tonic-gate * with messages to ouput. Just in case we're not in a build 671*0Sstevel@tonic-gate * environment, make sure that TEXT_DOMAIN gets set to 672*0Sstevel@tonic-gate * something. 
673*0Sstevel@tonic-gate */ 674*0Sstevel@tonic-gate #if !defined(TEXT_DOMAIN) 675*0Sstevel@tonic-gate #define TEXT_DOMAIN "SYS_TEST" 676*0Sstevel@tonic-gate #endif 677*0Sstevel@tonic-gate (void) setlocale(LC_ALL, ""); 678*0Sstevel@tonic-gate (void) textdomain(TEXT_DOMAIN); 679*0Sstevel@tonic-gate 680*0Sstevel@tonic-gate if ((clust = sdssc_bind_library()) == SDSSC_ERROR) { 681*0Sstevel@tonic-gate md_eprintf(gettext("Interface error with libsds_sc.so\n")); 682*0Sstevel@tonic-gate exit(1); 683*0Sstevel@tonic-gate } 684*0Sstevel@tonic-gate 685*0Sstevel@tonic-gate if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) { 686*0Sstevel@tonic-gate mde_perror(ep, ""); 687*0Sstevel@tonic-gate md_exit(sp, 1); 688*0Sstevel@tonic-gate } 689*0Sstevel@tonic-gate 690*0Sstevel@tonic-gate /* 691*0Sstevel@tonic-gate * open log and enable libmeta logging. Do it here explicitly 692*0Sstevel@tonic-gate * rather than letting md_init() do it because we are not really 693*0Sstevel@tonic-gate * a daemon and that is what md_init() opens the log as. 
694*0Sstevel@tonic-gate */ 695*0Sstevel@tonic-gate openlog("metaclust", LOG_CONS, LOG_USER); 696*0Sstevel@tonic-gate 697*0Sstevel@tonic-gate version_table_size = sizeof (version_table) / sizeof (version_table[0]); 698*0Sstevel@tonic-gate 699*0Sstevel@tonic-gate optind = 1; 700*0Sstevel@tonic-gate opterr = 0; 701*0Sstevel@tonic-gate while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) { 702*0Sstevel@tonic-gate switch (c) { 703*0Sstevel@tonic-gate case 'h': 704*0Sstevel@tonic-gate usage(sp, 0); 705*0Sstevel@tonic-gate break; 706*0Sstevel@tonic-gate 707*0Sstevel@tonic-gate case 'd': 708*0Sstevel@tonic-gate if (sscanf(optarg, "%u", &debug) != 1) { 709*0Sstevel@tonic-gate md_eprintf(gettext("Invalid debug level\n")); 710*0Sstevel@tonic-gate md_exit(sp, 1); 711*0Sstevel@tonic-gate } else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) { 712*0Sstevel@tonic-gate debug = min(max(debug, 1), MAX_DEBUG_LEVEL); 713*0Sstevel@tonic-gate md_eprintf(gettext("Debug level must be " 714*0Sstevel@tonic-gate "between 1 and %d inclusive.\n"), 715*0Sstevel@tonic-gate MAX_DEBUG_LEVEL); 716*0Sstevel@tonic-gate md_eprintf(gettext("Debug level set to %d.\n"), 717*0Sstevel@tonic-gate debug); 718*0Sstevel@tonic-gate } 719*0Sstevel@tonic-gate break; 720*0Sstevel@tonic-gate 721*0Sstevel@tonic-gate case 'V': 722*0Sstevel@tonic-gate version = Strdup(optarg); 723*0Sstevel@tonic-gate break; 724*0Sstevel@tonic-gate 725*0Sstevel@tonic-gate case 't': 726*0Sstevel@tonic-gate if (sscanf(optarg, "%u", &timeout) != 1) { 727*0Sstevel@tonic-gate md_eprintf(gettext("Invalid timeout value\n")); 728*0Sstevel@tonic-gate md_exit(sp, 1); 729*0Sstevel@tonic-gate } 730*0Sstevel@tonic-gate break; 731*0Sstevel@tonic-gate 732*0Sstevel@tonic-gate case '?': 733*0Sstevel@tonic-gate if (optopt == '?') { 734*0Sstevel@tonic-gate usage(sp, 0); 735*0Sstevel@tonic-gate } else if (optopt == 'V') { 736*0Sstevel@tonic-gate int i; 737*0Sstevel@tonic-gate 738*0Sstevel@tonic-gate fprintf(stdout, gettext( 739*0Sstevel@tonic-gate "%s: 
Versions Supported:"), myname); 740*0Sstevel@tonic-gate for (i = 0; i < version_table_size; i++) { 741*0Sstevel@tonic-gate fprintf(stdout, " %s", 742*0Sstevel@tonic-gate version_table[i]); 743*0Sstevel@tonic-gate } 744*0Sstevel@tonic-gate fprintf(stdout, "\n"); 745*0Sstevel@tonic-gate md_exit(sp, 0); 746*0Sstevel@tonic-gate } 747*0Sstevel@tonic-gate /*FALLTHROUGH*/ 748*0Sstevel@tonic-gate 749*0Sstevel@tonic-gate default: 750*0Sstevel@tonic-gate usage(sp, 1); 751*0Sstevel@tonic-gate break; 752*0Sstevel@tonic-gate } 753*0Sstevel@tonic-gate } 754*0Sstevel@tonic-gate 755*0Sstevel@tonic-gate /* initialise the debug level and start time */ 756*0Sstevel@tonic-gate setup_mc_log(debug); 757*0Sstevel@tonic-gate 758*0Sstevel@tonic-gate /* 759*0Sstevel@tonic-gate * check that the version specified (if any) is supported. 760*0Sstevel@tonic-gate */ 761*0Sstevel@tonic-gate if (version != NULL) { 762*0Sstevel@tonic-gate int i, found = 0; 763*0Sstevel@tonic-gate 764*0Sstevel@tonic-gate for (i = 0; i < version_table_size; i++) { 765*0Sstevel@tonic-gate if (strcmp(version, version_table[i]) == 0) { 766*0Sstevel@tonic-gate found = 1; 767*0Sstevel@tonic-gate break; 768*0Sstevel@tonic-gate } 769*0Sstevel@tonic-gate } 770*0Sstevel@tonic-gate if (!found) { 771*0Sstevel@tonic-gate md_eprintf(gettext("Version %s not supported\n"), 772*0Sstevel@tonic-gate version); 773*0Sstevel@tonic-gate md_exit(sp, 1); 774*0Sstevel@tonic-gate } 775*0Sstevel@tonic-gate } 776*0Sstevel@tonic-gate 777*0Sstevel@tonic-gate argc -= optind; 778*0Sstevel@tonic-gate argv += optind; 779*0Sstevel@tonic-gate 780*0Sstevel@tonic-gate /* parse arguments */ 781*0Sstevel@tonic-gate if (argc <= 0) { 782*0Sstevel@tonic-gate usage(sp, 1); 783*0Sstevel@tonic-gate } 784*0Sstevel@tonic-gate 785*0Sstevel@tonic-gate /* convert the step name to the corresponding number */ 786*0Sstevel@tonic-gate step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) / 787*0Sstevel@tonic-gate sizeof (step_table[0])), sizeof (step_table[0]), 
mc_compare); 788*0Sstevel@tonic-gate if (step_ptr != NULL) { 789*0Sstevel@tonic-gate stepnum = step_ptr->step_num; 790*0Sstevel@tonic-gate } 791*0Sstevel@tonic-gate 792*0Sstevel@tonic-gate --argc; 793*0Sstevel@tonic-gate ++argv; 794*0Sstevel@tonic-gate 795*0Sstevel@tonic-gate /* set timeout alarm signal, a value of 0 will disable timeout */ 796*0Sstevel@tonic-gate if (timeout > 0) { 797*0Sstevel@tonic-gate int stat_loc = 0; 798*0Sstevel@tonic-gate 799*0Sstevel@tonic-gate c_pid = fork(); 800*0Sstevel@tonic-gate 801*0Sstevel@tonic-gate if (c_pid == (pid_t)-1) { 802*0Sstevel@tonic-gate md_perror(gettext("Unable to fork")); 803*0Sstevel@tonic-gate md_exit(sp, 1); 804*0Sstevel@tonic-gate } else if (c_pid) { 805*0Sstevel@tonic-gate /* parent */ 806*0Sstevel@tonic-gate nsa.sa_flags = 0; 807*0Sstevel@tonic-gate if (sigfillset(&nsa.sa_mask) < 0) { 808*0Sstevel@tonic-gate md_perror(gettext("Unable to set signal mask")); 809*0Sstevel@tonic-gate md_exit(sp, 1); 810*0Sstevel@tonic-gate } 811*0Sstevel@tonic-gate 812*0Sstevel@tonic-gate nsa.sa_handler = sigalarmhandler; 813*0Sstevel@tonic-gate if (sigaction(SIGALRM, &nsa, &osa) == -1) { 814*0Sstevel@tonic-gate md_perror(gettext("Unable to set alarm " 815*0Sstevel@tonic-gate "handler")); 816*0Sstevel@tonic-gate md_exit(sp, 1); 817*0Sstevel@tonic-gate } 818*0Sstevel@tonic-gate 819*0Sstevel@tonic-gate (void) alarm(timeout); 820*0Sstevel@tonic-gate 821*0Sstevel@tonic-gate /* 822*0Sstevel@tonic-gate * wait for child to exit or timeout to expire. 
823*0Sstevel@tonic-gate * keep retrying if the call is interrupted 824*0Sstevel@tonic-gate */ 825*0Sstevel@tonic-gate while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) { 826*0Sstevel@tonic-gate if (errno != EINTR) { 827*0Sstevel@tonic-gate break; 828*0Sstevel@tonic-gate } 829*0Sstevel@tonic-gate } 830*0Sstevel@tonic-gate if (ret_val == c_pid) { 831*0Sstevel@tonic-gate /* exit with the childs exit value */ 832*0Sstevel@tonic-gate exit(WEXITSTATUS(stat_loc)); 833*0Sstevel@tonic-gate } else if (errno == ECHILD) { 834*0Sstevel@tonic-gate md_exit(sp, 0); 835*0Sstevel@tonic-gate } else { 836*0Sstevel@tonic-gate perror(myname); 837*0Sstevel@tonic-gate md_exit(sp, 1); 838*0Sstevel@tonic-gate } 839*0Sstevel@tonic-gate } 840*0Sstevel@tonic-gate } 841*0Sstevel@tonic-gate 842*0Sstevel@tonic-gate /* 843*0Sstevel@tonic-gate * If a timeout value is given, everything from this point onwards is 844*0Sstevel@tonic-gate * executed in the child process. 845*0Sstevel@tonic-gate */ 846*0Sstevel@tonic-gate 847*0Sstevel@tonic-gate switch (stepnum) { 848*0Sstevel@tonic-gate case MC_START: 849*0Sstevel@tonic-gate /* 850*0Sstevel@tonic-gate * Start Step 851*0Sstevel@tonic-gate * 852*0Sstevel@tonic-gate * - Suspend all rpc.mdcommd messages 853*0Sstevel@tonic-gate */ 854*0Sstevel@tonic-gate 855*0Sstevel@tonic-gate /* expect the local node id to be given only */ 856*0Sstevel@tonic-gate if (argc != 1) 857*0Sstevel@tonic-gate usage(sp, 1); 858*0Sstevel@tonic-gate 859*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"), 860*0Sstevel@tonic-gate meta_print_hrtime(0)); 861*0Sstevel@tonic-gate 862*0Sstevel@tonic-gate /* 863*0Sstevel@tonic-gate * Does local set exist? If not, exit with 0 864*0Sstevel@tonic-gate * since there's no reason to have this node panic if 865*0Sstevel@tonic-gate * the local set cannot be started. 
866*0Sstevel@tonic-gate */ 867*0Sstevel@tonic-gate if ((local_sp = load_local_set(ep)) == NULL) { 868*0Sstevel@tonic-gate md_exit(local_sp, 0); 869*0Sstevel@tonic-gate } 870*0Sstevel@tonic-gate 871*0Sstevel@tonic-gate if ((max_sets = get_max_sets(ep)) == 0) { 872*0Sstevel@tonic-gate mde_perror(ep, ""); 873*0Sstevel@tonic-gate md_exit(sp, 1); 874*0Sstevel@tonic-gate } 875*0Sstevel@tonic-gate 876*0Sstevel@tonic-gate /* start walking through all possible disksets */ 877*0Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) { 878*0Sstevel@tonic-gate if ((sp = metasetnosetname(setno, ep)) == NULL) { 879*0Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 880*0Sstevel@tonic-gate /* No set for this setno - continue */ 881*0Sstevel@tonic-gate mdclrerror(ep); 882*0Sstevel@tonic-gate continue; 883*0Sstevel@tonic-gate } else { 884*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to " 885*0Sstevel@tonic-gate "get set %d information"), setno); 886*0Sstevel@tonic-gate md_exit(sp, 1); 887*0Sstevel@tonic-gate } 888*0Sstevel@tonic-gate } 889*0Sstevel@tonic-gate 890*0Sstevel@tonic-gate /* only check multi-node disksets */ 891*0Sstevel@tonic-gate if (!meta_is_mn_set(sp, ep)) { 892*0Sstevel@tonic-gate mdclrerror(ep); 893*0Sstevel@tonic-gate continue; 894*0Sstevel@tonic-gate } 895*0Sstevel@tonic-gate 896*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Start - block parse " 897*0Sstevel@tonic-gate "messages for set %s: %s"), sp->setname, 898*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 899*0Sstevel@tonic-gate 900*0Sstevel@tonic-gate /* 901*0Sstevel@tonic-gate * Mddb parse messages are sent amongst the nodes 902*0Sstevel@tonic-gate * in a diskset whenever the locator block or 903*0Sstevel@tonic-gate * locator names structure has been changed. 
904*0Sstevel@tonic-gate * A locator block change could occur as a result 905*0Sstevel@tonic-gate * of a disk failure during the reconfig cycle, 906*0Sstevel@tonic-gate * so block the mddb parse messages while the 907*0Sstevel@tonic-gate * rpc.mdcommd is suspended during the reconfig cycle. 908*0Sstevel@tonic-gate */ 909*0Sstevel@tonic-gate if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 910*0Sstevel@tonic-gate (void) memset(&mbp, 0, sizeof (mbp)); 911*0Sstevel@tonic-gate mbp.c_setno = setno; 912*0Sstevel@tonic-gate mbp.c_blk_flags = MDDB_BLOCK_PARSE; 913*0Sstevel@tonic-gate if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 914*0Sstevel@tonic-gate &mbp.c_mde, NULL)) { 915*0Sstevel@tonic-gate mdstealerror(ep, &mbp.c_mde); 916*0Sstevel@tonic-gate mde_perror(ep, gettext("Could not " 917*0Sstevel@tonic-gate "block set %s"), sp->setname); 918*0Sstevel@tonic-gate md_exit(sp, 1); 919*0Sstevel@tonic-gate } 920*0Sstevel@tonic-gate } 921*0Sstevel@tonic-gate 922*0Sstevel@tonic-gate /* suspend commd and spin waiting for drain */ 923*0Sstevel@tonic-gate while ((ret_val = mdmn_suspend(setno, 924*0Sstevel@tonic-gate MD_COMM_ALL_CLASSES)) == 925*0Sstevel@tonic-gate MDE_DS_COMMDCTL_SUSPEND_NYD) { 926*0Sstevel@tonic-gate sleep(1); 927*0Sstevel@tonic-gate } 928*0Sstevel@tonic-gate 929*0Sstevel@tonic-gate if (ret_val) { 930*0Sstevel@tonic-gate md_eprintf(gettext("Could not suspend " 931*0Sstevel@tonic-gate "rpc.mdcommd for set %s\n"), sp->setname); 932*0Sstevel@tonic-gate md_exit(sp, 1); 933*0Sstevel@tonic-gate } 934*0Sstevel@tonic-gate 935*0Sstevel@tonic-gate /* 936*0Sstevel@tonic-gate * Set start step flag for set. This is set to indicate 937*0Sstevel@tonic-gate * that the reconfig cycle entered through the start 938*0Sstevel@tonic-gate * step and is used in reconfig step 4 to determine 939*0Sstevel@tonic-gate * whether the node had entered through the start 940*0Sstevel@tonic-gate * step or the return step. 
941*0Sstevel@tonic-gate */ 942*0Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 943*0Sstevel@tonic-gate sf.sf_setno = sp->setno; 944*0Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_START_RC; 945*0Sstevel@tonic-gate sf.sf_flags = MDDB_NM_SET; 946*0Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */ 947*0Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 948*0Sstevel@tonic-gate if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 949*0Sstevel@tonic-gate &sf.sf_mde, NULL)) { 950*0Sstevel@tonic-gate mdstealerror(ep, &sf.sf_mde); 951*0Sstevel@tonic-gate mde_perror(ep, gettext("Could not set " 952*0Sstevel@tonic-gate "start_step flag for set %s"), sp->setname); 953*0Sstevel@tonic-gate md_exit(sp, 1); 954*0Sstevel@tonic-gate } 955*0Sstevel@tonic-gate 956*0Sstevel@tonic-gate } 957*0Sstevel@tonic-gate 958*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Start step completed: %s"), 959*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 960*0Sstevel@tonic-gate 961*0Sstevel@tonic-gate break; 962*0Sstevel@tonic-gate 963*0Sstevel@tonic-gate case MC_STOP: 964*0Sstevel@tonic-gate /* 965*0Sstevel@tonic-gate * Stop Step 966*0Sstevel@tonic-gate * 967*0Sstevel@tonic-gate * - ??? 
968*0Sstevel@tonic-gate */ 969*0Sstevel@tonic-gate 970*0Sstevel@tonic-gate /* don't expect any more arguments to follow the step name */ 971*0Sstevel@tonic-gate if (argc != 0) 972*0Sstevel@tonic-gate usage(sp, 1); 973*0Sstevel@tonic-gate 974*0Sstevel@tonic-gate break; 975*0Sstevel@tonic-gate 976*0Sstevel@tonic-gate case MC_ABORT: 977*0Sstevel@tonic-gate /* 978*0Sstevel@tonic-gate * Abort Step 979*0Sstevel@tonic-gate * 980*0Sstevel@tonic-gate * - Abort rpc.mdcommd 981*0Sstevel@tonic-gate */ 982*0Sstevel@tonic-gate 983*0Sstevel@tonic-gate /* don't expect any more arguments to follow the step name */ 984*0Sstevel@tonic-gate if (argc != 0) 985*0Sstevel@tonic-gate usage(sp, 1); 986*0Sstevel@tonic-gate 987*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"), 988*0Sstevel@tonic-gate meta_print_hrtime(0)); 989*0Sstevel@tonic-gate 990*0Sstevel@tonic-gate /* 991*0Sstevel@tonic-gate * Does local set exist? If not, exit with 0 992*0Sstevel@tonic-gate * since there's no reason to have this node panic if 993*0Sstevel@tonic-gate * the local set cannot be started. 994*0Sstevel@tonic-gate */ 995*0Sstevel@tonic-gate if ((local_sp = load_local_set(ep)) == NULL) { 996*0Sstevel@tonic-gate md_exit(local_sp, 0); 997*0Sstevel@tonic-gate } 998*0Sstevel@tonic-gate 999*0Sstevel@tonic-gate /* 1000*0Sstevel@tonic-gate * abort the rpc.mdcommd. The abort is only issued on this node 1001*0Sstevel@tonic-gate * meaning that the abort reconfig step is called on this 1002*0Sstevel@tonic-gate * node before a panic while the rest of the cluster will 1003*0Sstevel@tonic-gate * undergo a reconfig cycle. 1004*0Sstevel@tonic-gate * There is no time relation between this node running a 1005*0Sstevel@tonic-gate * reconfig abort and the the rest of the cluster 1006*0Sstevel@tonic-gate * running a reconfig cycle meaning that this node may 1007*0Sstevel@tonic-gate * panic before, during or after the cluster has run 1008*0Sstevel@tonic-gate * a reconfig cycle. 
1009*0Sstevel@tonic-gate */ 1010*0Sstevel@tonic-gate mdmn_abort(); 1011*0Sstevel@tonic-gate 1012*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"), 1013*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1014*0Sstevel@tonic-gate 1015*0Sstevel@tonic-gate break; 1016*0Sstevel@tonic-gate 1017*0Sstevel@tonic-gate case MC_RETURN: 1018*0Sstevel@tonic-gate /* 1019*0Sstevel@tonic-gate * Return Step 1020*0Sstevel@tonic-gate * 1021*0Sstevel@tonic-gate * - Grab local set lock, issue rpc.mdcommd DRAIN ALL 1022*0Sstevel@tonic-gate * and release local set lock. Grabbing the local set 1023*0Sstevel@tonic-gate * lock allows any active metaset/metadb commands to 1024*0Sstevel@tonic-gate * terminate gracefully and will keep a metaset/metadb 1025*0Sstevel@tonic-gate * command from starting until the DRAIN ALL is issued. 1026*0Sstevel@tonic-gate * The metaset/metadb commands can issue 1027*0Sstevel@tonic-gate * DRAIN ALL/RESUME ALL commands to rpc.mdcommd, 1028*0Sstevel@tonic-gate * so the return step must not issue the DRAIN ALL command 1029*0Sstevel@tonic-gate * until metaset/metadb have finished or metaset may issue 1030*0Sstevel@tonic-gate * a RESUME ALL after this return reconfig step has issued 1031*0Sstevel@tonic-gate * the DRAIN ALL command. 1032*0Sstevel@tonic-gate * After this reconfig step has issued the DRAIN_ALL and 1033*0Sstevel@tonic-gate * released the local set lock, metaset/metadb will fail 1034*0Sstevel@tonic-gate * when attempting to contact the rpc.mdcommd and will 1035*0Sstevel@tonic-gate * terminate without making any configuration changes. 1036*0Sstevel@tonic-gate * The DRAIN ALL command will keep all other meta* commands 1037*0Sstevel@tonic-gate * from running during the reconfig cycle (these commands 1038*0Sstevel@tonic-gate * will wait until the rpc.mdcommd is resumed) since the 1039*0Sstevel@tonic-gate * reconfig cycle may be changing the diskset configuration. 
1040*0Sstevel@tonic-gate */ 1041*0Sstevel@tonic-gate 1042*0Sstevel@tonic-gate /* expect the nodelist to follow the step name */ 1043*0Sstevel@tonic-gate if (argc < 1) 1044*0Sstevel@tonic-gate usage(sp, 1); 1045*0Sstevel@tonic-gate 1046*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"), 1047*0Sstevel@tonic-gate meta_print_hrtime(0)); 1048*0Sstevel@tonic-gate 1049*0Sstevel@tonic-gate /* 1050*0Sstevel@tonic-gate * Does local set exist? If not, exit with 0 1051*0Sstevel@tonic-gate * since there's no reason to have this node panic if 1052*0Sstevel@tonic-gate * the local set cannot be started. 1053*0Sstevel@tonic-gate */ 1054*0Sstevel@tonic-gate if ((local_sp = load_local_set(ep)) == NULL) { 1055*0Sstevel@tonic-gate md_exit(local_sp, 0); 1056*0Sstevel@tonic-gate } 1057*0Sstevel@tonic-gate 1058*0Sstevel@tonic-gate /* 1059*0Sstevel@tonic-gate * Suspend any mirror resyncs that are in progress. This 1060*0Sstevel@tonic-gate * stops unnecessary timeouts. 1061*0Sstevel@tonic-gate */ 1062*0Sstevel@tonic-gate meta_mirror_resync_block_all(); 1063*0Sstevel@tonic-gate 1064*0Sstevel@tonic-gate if (meta_lock(local_sp, TRUE, ep) != 0) { 1065*0Sstevel@tonic-gate mde_perror(ep, ""); 1066*0Sstevel@tonic-gate md_exit(local_sp, 1); 1067*0Sstevel@tonic-gate } 1068*0Sstevel@tonic-gate 1069*0Sstevel@tonic-gate /* 1070*0Sstevel@tonic-gate * All metaset and metadb commands on this node have now 1071*0Sstevel@tonic-gate * terminated gracefully. Now, issue a drain all to 1072*0Sstevel@tonic-gate * the rpc.mdcommd. Any meta command issued after the 1073*0Sstevel@tonic-gate * drain all will either spin sending the command to the 1074*0Sstevel@tonic-gate * master until after the reconfig cycle has finished OR 1075*0Sstevel@tonic-gate * will terminate gracefully (metaset/metadb). 
1076*0Sstevel@tonic-gate */ 1077*0Sstevel@tonic-gate if ((max_sets = get_max_sets(ep)) == 0) { 1078*0Sstevel@tonic-gate mde_perror(ep, ""); 1079*0Sstevel@tonic-gate md_exit(sp, 1); 1080*0Sstevel@tonic-gate } 1081*0Sstevel@tonic-gate 1082*0Sstevel@tonic-gate /* start walking through all possible disksets */ 1083*0Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) { 1084*0Sstevel@tonic-gate if ((sp = metasetnosetname(setno, ep)) == NULL) { 1085*0Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 1086*0Sstevel@tonic-gate /* No set for this setno - continue */ 1087*0Sstevel@tonic-gate mdclrerror(ep); 1088*0Sstevel@tonic-gate continue; 1089*0Sstevel@tonic-gate } else { 1090*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to " 1091*0Sstevel@tonic-gate "get set %d information"), setno); 1092*0Sstevel@tonic-gate md_exit(sp, 1); 1093*0Sstevel@tonic-gate } 1094*0Sstevel@tonic-gate } 1095*0Sstevel@tonic-gate 1096*0Sstevel@tonic-gate /* only check multi-node disksets */ 1097*0Sstevel@tonic-gate if (!meta_is_mn_set(sp, ep)) { 1098*0Sstevel@tonic-gate mdclrerror(ep); 1099*0Sstevel@tonic-gate continue; 1100*0Sstevel@tonic-gate } 1101*0Sstevel@tonic-gate 1102*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Return - block parse " 1103*0Sstevel@tonic-gate "messages for set %s: %s"), sp->setname, 1104*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1105*0Sstevel@tonic-gate 1106*0Sstevel@tonic-gate /* 1107*0Sstevel@tonic-gate * Mddb parse messages are sent amongst the nodes 1108*0Sstevel@tonic-gate * in a diskset whenever the locator block or 1109*0Sstevel@tonic-gate * locator names structure has been changed. 1110*0Sstevel@tonic-gate * A locator block change could occur as a result 1111*0Sstevel@tonic-gate * of a disk failure during the reconfig cycle, 1112*0Sstevel@tonic-gate * so block the mddb parse messages while the 1113*0Sstevel@tonic-gate * rpc.commd is suspended during the reconfig cycle. 
1114*0Sstevel@tonic-gate */ 1115*0Sstevel@tonic-gate if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1116*0Sstevel@tonic-gate (void) memset(&mbp, 0, sizeof (mbp)); 1117*0Sstevel@tonic-gate mbp.c_setno = setno; 1118*0Sstevel@tonic-gate mbp.c_blk_flags = MDDB_BLOCK_PARSE; 1119*0Sstevel@tonic-gate if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1120*0Sstevel@tonic-gate &mbp.c_mde, NULL)) { 1121*0Sstevel@tonic-gate mdstealerror(ep, &mbp.c_mde); 1122*0Sstevel@tonic-gate mde_perror(ep, gettext("Could not " 1123*0Sstevel@tonic-gate "block set %s"), sp->setname); 1124*0Sstevel@tonic-gate md_exit(sp, 1); 1125*0Sstevel@tonic-gate } 1126*0Sstevel@tonic-gate } 1127*0Sstevel@tonic-gate 1128*0Sstevel@tonic-gate /* suspend commd and spin waiting for drain */ 1129*0Sstevel@tonic-gate while ((ret_val = mdmn_suspend(setno, 1130*0Sstevel@tonic-gate MD_COMM_ALL_CLASSES)) == 1131*0Sstevel@tonic-gate MDE_DS_COMMDCTL_SUSPEND_NYD) { 1132*0Sstevel@tonic-gate sleep(1); 1133*0Sstevel@tonic-gate } 1134*0Sstevel@tonic-gate 1135*0Sstevel@tonic-gate if (ret_val) { 1136*0Sstevel@tonic-gate md_eprintf(gettext("Could not suspend " 1137*0Sstevel@tonic-gate "rpc.mdcommd for set %s\n"), sp->setname); 1138*0Sstevel@tonic-gate md_exit(sp, 1); 1139*0Sstevel@tonic-gate } 1140*0Sstevel@tonic-gate } 1141*0Sstevel@tonic-gate /* 1142*0Sstevel@tonic-gate * Resume all I/Os for this node for all MN sets in 1143*0Sstevel@tonic-gate * case master node had suspended I/Os but panic'd 1144*0Sstevel@tonic-gate * before resuming I/Os. In case of failure, exit 1145*0Sstevel@tonic-gate * with a 1 since unable to resume I/Os on this node. 
1146*0Sstevel@tonic-gate */ 1147*0Sstevel@tonic-gate if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) { 1148*0Sstevel@tonic-gate mde_perror(ep, gettext( 1149*0Sstevel@tonic-gate "Unable to resume I/O on node %s for all sets"), 1150*0Sstevel@tonic-gate mynode()); 1151*0Sstevel@tonic-gate md_exit(sp, 1); 1152*0Sstevel@tonic-gate } 1153*0Sstevel@tonic-gate 1154*0Sstevel@tonic-gate 1155*0Sstevel@tonic-gate /* 1156*0Sstevel@tonic-gate * Can now unlock local set lock. New metaset/metadb 1157*0Sstevel@tonic-gate * commands are now held off using drain all. 1158*0Sstevel@tonic-gate */ 1159*0Sstevel@tonic-gate (void) meta_unlock(local_sp, ep); 1160*0Sstevel@tonic-gate 1161*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Return step completed: %s"), 1162*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1163*0Sstevel@tonic-gate 1164*0Sstevel@tonic-gate break; 1165*0Sstevel@tonic-gate 1166*0Sstevel@tonic-gate case MC_STEP1: 1167*0Sstevel@tonic-gate /* 1168*0Sstevel@tonic-gate * Step 1 1169*0Sstevel@tonic-gate * 1170*0Sstevel@tonic-gate * - Populate nodelist file if we are on clustering 1171*0Sstevel@tonic-gate * and pick a master node for each MN diskset. 
1172*0Sstevel@tonic-gate */ 1173*0Sstevel@tonic-gate 1174*0Sstevel@tonic-gate /* expect the nodelist to follow the step name */ 1175*0Sstevel@tonic-gate if (argc < 1) 1176*0Sstevel@tonic-gate usage(sp, 1); 1177*0Sstevel@tonic-gate 1178*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"), 1179*0Sstevel@tonic-gate meta_print_hrtime(0)); 1180*0Sstevel@tonic-gate 1181*0Sstevel@tonic-gate /* Always write nodelist file even if no local set exists */ 1182*0Sstevel@tonic-gate if (clust == SDSSC_OKAY) { 1183*0Sstevel@tonic-gate /* skip to the nodelist args */ 1184*0Sstevel@tonic-gate if (meta_write_nodelist(argc, argv, ep) != 0) { 1185*0Sstevel@tonic-gate mde_perror(ep, gettext( 1186*0Sstevel@tonic-gate "Could not populate nodelist file")); 1187*0Sstevel@tonic-gate md_exit(sp, 1); 1188*0Sstevel@tonic-gate } 1189*0Sstevel@tonic-gate } 1190*0Sstevel@tonic-gate 1191*0Sstevel@tonic-gate /* 1192*0Sstevel@tonic-gate * Does local set exist? If not, exit with 0 1193*0Sstevel@tonic-gate * since there's no reason to have this node panic if 1194*0Sstevel@tonic-gate * the local set cannot be started. 1195*0Sstevel@tonic-gate */ 1196*0Sstevel@tonic-gate if ((local_sp = load_local_set(ep)) == NULL) { 1197*0Sstevel@tonic-gate md_exit(local_sp, 0); 1198*0Sstevel@tonic-gate } 1199*0Sstevel@tonic-gate 1200*0Sstevel@tonic-gate /* 1201*0Sstevel@tonic-gate * At this point, all meta* commands are blocked across 1202*0Sstevel@tonic-gate * all disksets since the master rpc.mdcommd has drained or 1203*0Sstevel@tonic-gate * the master node has died. 1204*0Sstevel@tonic-gate * If a metaset or metadb command had been in progress 1205*0Sstevel@tonic-gate * at the start of the reconfig cycle, this command has 1206*0Sstevel@tonic-gate * either completed or it has been terminated due to 1207*0Sstevel@tonic-gate * the death of the master node. 
1208*0Sstevel@tonic-gate * 1209*0Sstevel@tonic-gate * This means that that it is now ok to remove any 1210*0Sstevel@tonic-gate * outstanding clnt_locks associated with multinode 1211*0Sstevel@tonic-gate * disksets on this node due to a node panic during 1212*0Sstevel@tonic-gate * a metaset operation. This allows the routines that 1213*0Sstevel@tonic-gate * choose the master to use rpc.metad to determine the 1214*0Sstevel@tonic-gate * master of the diskset. 1215*0Sstevel@tonic-gate */ 1216*0Sstevel@tonic-gate if (clnt_clr_mnsetlock(mynode(), ep) != 0) { 1217*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1218*0Sstevel@tonic-gate "clear locks failed %s"), 1219*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1220*0Sstevel@tonic-gate md_exit(local_sp, 1); 1221*0Sstevel@tonic-gate } 1222*0Sstevel@tonic-gate 1223*0Sstevel@tonic-gate /* 1224*0Sstevel@tonic-gate * Call reconfig_choose_master to choose a master for 1225*0Sstevel@tonic-gate * each MN diskset, update the nodelist for each diskset 1226*0Sstevel@tonic-gate * given the member information and send a reinit message 1227*0Sstevel@tonic-gate * to rpc.mdcommd to reload the nodelist. 1228*0Sstevel@tonic-gate */ 1229*0Sstevel@tonic-gate rval = meta_reconfig_choose_master(ep); 1230*0Sstevel@tonic-gate if (rval == 205) { 1231*0Sstevel@tonic-gate /* 1232*0Sstevel@tonic-gate * NOTE: Should issue call to reboot remote host that 1233*0Sstevel@tonic-gate * is causing the RPC failure. Clustering to 1234*0Sstevel@tonic-gate * provide interface in the future. This should 1235*0Sstevel@tonic-gate * stop a never-ending set of 205 reconfig cycles. 1236*0Sstevel@tonic-gate * Remote host causing failure is stored in 1237*0Sstevel@tonic-gate * ep->host if ep is an RPC error. 
1238*0Sstevel@tonic-gate * if (mdanyrpcerror(ep)) 1239*0Sstevel@tonic-gate * reboot (ep->host); 1240*0Sstevel@tonic-gate */ 1241*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1242*0Sstevel@tonic-gate "choose master failure of 205 %s"), 1243*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1244*0Sstevel@tonic-gate md_exit(local_sp, 205); 1245*0Sstevel@tonic-gate } else if (rval != 0) { 1246*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Step1 failure: " 1247*0Sstevel@tonic-gate "choose master failure %s"), 1248*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1249*0Sstevel@tonic-gate md_exit(local_sp, 1); 1250*0Sstevel@tonic-gate } 1251*0Sstevel@tonic-gate 1252*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"), 1253*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1254*0Sstevel@tonic-gate 1255*0Sstevel@tonic-gate md_exit(local_sp, rval); 1256*0Sstevel@tonic-gate break; 1257*0Sstevel@tonic-gate 1258*0Sstevel@tonic-gate case MC_STEP2: 1259*0Sstevel@tonic-gate /* 1260*0Sstevel@tonic-gate * Step 2 1261*0Sstevel@tonic-gate * 1262*0Sstevel@tonic-gate * In Step 2, each node walks the list of disksets. If a 1263*0Sstevel@tonic-gate * node is a master of a MN diskset, it synchronizes 1264*0Sstevel@tonic-gate * the local set USER records for that diskset. 1265*0Sstevel@tonic-gate * 1266*0Sstevel@tonic-gate * If disks exist in the diskset and there is a joined 1267*0Sstevel@tonic-gate * (owner) node in the diskset, the master will also: 1268*0Sstevel@tonic-gate * - synchronize the diskset mddbs to the master 1269*0Sstevel@tonic-gate * - play the change log 1270*0Sstevel@tonic-gate * 1271*0Sstevel@tonic-gate * The master node will now attempt to join any unjoined 1272*0Sstevel@tonic-gate * nodes that are currently members in the membership list. 
1273*0Sstevel@tonic-gate */ 1274*0Sstevel@tonic-gate 1275*0Sstevel@tonic-gate /* expect the nodelist to follow the step name */ 1276*0Sstevel@tonic-gate if (argc < 1) 1277*0Sstevel@tonic-gate usage(sp, 1); 1278*0Sstevel@tonic-gate 1279*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"), 1280*0Sstevel@tonic-gate meta_print_hrtime(0)); 1281*0Sstevel@tonic-gate 1282*0Sstevel@tonic-gate /* 1283*0Sstevel@tonic-gate * Does local set exist? If not, exit with 0 1284*0Sstevel@tonic-gate * since there's no reason to have this node panic if 1285*0Sstevel@tonic-gate * the local set cannot be started. 1286*0Sstevel@tonic-gate */ 1287*0Sstevel@tonic-gate if ((local_sp = load_local_set(ep)) == NULL) { 1288*0Sstevel@tonic-gate md_exit(local_sp, 0); 1289*0Sstevel@tonic-gate } 1290*0Sstevel@tonic-gate 1291*0Sstevel@tonic-gate if ((max_sets = get_max_sets(ep)) == 0) { 1292*0Sstevel@tonic-gate mde_perror(ep, ""); 1293*0Sstevel@tonic-gate md_exit(local_sp, 1); 1294*0Sstevel@tonic-gate } 1295*0Sstevel@tonic-gate 1296*0Sstevel@tonic-gate /* start walking through all possible disksets */ 1297*0Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) { 1298*0Sstevel@tonic-gate if ((sp = metasetnosetname(setno, ep)) == NULL) { 1299*0Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 1300*0Sstevel@tonic-gate /* No set for this setno - continue */ 1301*0Sstevel@tonic-gate mdclrerror(ep); 1302*0Sstevel@tonic-gate continue; 1303*0Sstevel@tonic-gate } else if (mdanyrpcerror(ep)) { 1304*0Sstevel@tonic-gate /* Fail on RPC failure to self */ 1305*0Sstevel@tonic-gate mde_perror(ep, gettext( 1306*0Sstevel@tonic-gate "Unable to get information for " 1307*0Sstevel@tonic-gate "set number %d"), setno); 1308*0Sstevel@tonic-gate md_exit(local_sp, 1); 1309*0Sstevel@tonic-gate } else { 1310*0Sstevel@tonic-gate mde_perror(ep, gettext( 1311*0Sstevel@tonic-gate "Unable to get information for " 1312*0Sstevel@tonic-gate "set number %d"), setno); 1313*0Sstevel@tonic-gate mdclrerror(ep); 
1314*0Sstevel@tonic-gate continue; 1315*0Sstevel@tonic-gate } 1316*0Sstevel@tonic-gate } 1317*0Sstevel@tonic-gate 1318*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1319*0Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 1320*0Sstevel@tonic-gate /* Fail on RPC failure to self */ 1321*0Sstevel@tonic-gate mde_perror(ep, gettext( 1322*0Sstevel@tonic-gate "Unable to get information for " 1323*0Sstevel@tonic-gate "set number %d"), setno); 1324*0Sstevel@tonic-gate md_exit(local_sp, 1); 1325*0Sstevel@tonic-gate } 1326*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to get set " 1327*0Sstevel@tonic-gate "%s desc information"), sp->setname); 1328*0Sstevel@tonic-gate mdclrerror(ep); 1329*0Sstevel@tonic-gate continue; 1330*0Sstevel@tonic-gate } 1331*0Sstevel@tonic-gate 1332*0Sstevel@tonic-gate /* Only check MN disksets */ 1333*0Sstevel@tonic-gate if (!(MD_MNSET_DESC(sd))) { 1334*0Sstevel@tonic-gate continue; 1335*0Sstevel@tonic-gate } 1336*0Sstevel@tonic-gate 1337*0Sstevel@tonic-gate /* All actions in step 2 are driven by master */ 1338*0Sstevel@tonic-gate if (!(sd->sd_mn_am_i_master)) { 1339*0Sstevel@tonic-gate continue; 1340*0Sstevel@tonic-gate } 1341*0Sstevel@tonic-gate 1342*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Step2 - begin record " 1343*0Sstevel@tonic-gate "synchronization for set %s: %s"), sp->setname, 1344*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1345*0Sstevel@tonic-gate 1346*0Sstevel@tonic-gate /* 1347*0Sstevel@tonic-gate * Synchronize the USER records in the local mddbs 1348*0Sstevel@tonic-gate * for hosts that are members. The USER records 1349*0Sstevel@tonic-gate * contain set, drive and host information. 
1350*0Sstevel@tonic-gate */ 1351*0Sstevel@tonic-gate rval = meta_mnsync_user_records(sp, ep); 1352*0Sstevel@tonic-gate if (rval != 0) { 1353*0Sstevel@tonic-gate mde_perror(ep, gettext( 1354*0Sstevel@tonic-gate "Synchronization of user records " 1355*0Sstevel@tonic-gate "in set %s failed\n"), sp->setname); 1356*0Sstevel@tonic-gate if (rval == 205) { 1357*0Sstevel@tonic-gate /* 1358*0Sstevel@tonic-gate * NOTE: Should issue call to reboot 1359*0Sstevel@tonic-gate * remote host that is causing the RPC 1360*0Sstevel@tonic-gate * failure. Clustering to provide 1361*0Sstevel@tonic-gate * interface in the future. This 1362*0Sstevel@tonic-gate * should stop a never-ending set of 1363*0Sstevel@tonic-gate * 205 reconfig cycles. 1364*0Sstevel@tonic-gate * Remote host causing failure is 1365*0Sstevel@tonic-gate * stored in ep->host if ep is an 1366*0Sstevel@tonic-gate * RPC error. 1367*0Sstevel@tonic-gate * if (mdanyrpcerror(ep)) 1368*0Sstevel@tonic-gate * reboot (ep->host); 1369*0Sstevel@tonic-gate */ 1370*0Sstevel@tonic-gate md_exit(local_sp, 205); 1371*0Sstevel@tonic-gate } else { 1372*0Sstevel@tonic-gate md_exit(local_sp, 1); 1373*0Sstevel@tonic-gate } 1374*0Sstevel@tonic-gate } 1375*0Sstevel@tonic-gate 1376*0Sstevel@tonic-gate /* Reget sd since sync_user_recs may have flushed it */ 1377*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1378*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to get set " 1379*0Sstevel@tonic-gate "%s desc information"), sp->setname); 1380*0Sstevel@tonic-gate md_exit(local_sp, 1); 1381*0Sstevel@tonic-gate } 1382*0Sstevel@tonic-gate 1383*0Sstevel@tonic-gate dd = metaget_drivedesc(sp, 1384*0Sstevel@tonic-gate (MD_BASICNAME_OK | PRINT_FAST), ep); 1385*0Sstevel@tonic-gate if (! 
mdisok(ep)) { 1386*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to get set " 1387*0Sstevel@tonic-gate "%s drive information"), sp->setname); 1388*0Sstevel@tonic-gate md_exit(local_sp, 1); 1389*0Sstevel@tonic-gate } 1390*0Sstevel@tonic-gate 1391*0Sstevel@tonic-gate /* 1392*0Sstevel@tonic-gate * No drives in set, continue to next set. 1393*0Sstevel@tonic-gate */ 1394*0Sstevel@tonic-gate if (dd == NULL) { 1395*0Sstevel@tonic-gate /* Done with this set */ 1396*0Sstevel@tonic-gate continue; 1397*0Sstevel@tonic-gate } 1398*0Sstevel@tonic-gate 1399*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Step2 - local set user " 1400*0Sstevel@tonic-gate "records completed for set %s: %s"), sp->setname, 1401*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1402*0Sstevel@tonic-gate 1403*0Sstevel@tonic-gate /* 1404*0Sstevel@tonic-gate * Synchronize the diskset mddbs for hosts 1405*0Sstevel@tonic-gate * that are members. This may involve 1406*0Sstevel@tonic-gate * playing the changelog and writing out 1407*0Sstevel@tonic-gate * to the diskset mddbs. 1408*0Sstevel@tonic-gate */ 1409*0Sstevel@tonic-gate rval = meta_mnsync_diskset_mddbs(sp, ep); 1410*0Sstevel@tonic-gate if (rval != 0) { 1411*0Sstevel@tonic-gate mde_perror(ep, gettext( 1412*0Sstevel@tonic-gate "Synchronization of diskset mddbs " 1413*0Sstevel@tonic-gate "in set %s failed\n"), sp->setname); 1414*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Step2 - diskset " 1415*0Sstevel@tonic-gate "mddb synchronization failed for " 1416*0Sstevel@tonic-gate "set %s: %s"), sp->setname, 1417*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - 1418*0Sstevel@tonic-gate start_time)); 1419*0Sstevel@tonic-gate if (rval == 205) { 1420*0Sstevel@tonic-gate /* 1421*0Sstevel@tonic-gate * NOTE: Should issue call to reboot 1422*0Sstevel@tonic-gate * remote host that is causing the RPC 1423*0Sstevel@tonic-gate * failure. Clustering to provide 1424*0Sstevel@tonic-gate * interface in the future. 
This 1425*0Sstevel@tonic-gate * should stop a never-ending set of 1426*0Sstevel@tonic-gate * 205 reconfig cycles. 1427*0Sstevel@tonic-gate * Remote host causing failure is 1428*0Sstevel@tonic-gate * stored in ep->host if ep is an 1429*0Sstevel@tonic-gate * RPC error. 1430*0Sstevel@tonic-gate * if (mdanyrpcerror(ep)) 1431*0Sstevel@tonic-gate * reboot (ep->host); 1432*0Sstevel@tonic-gate */ 1433*0Sstevel@tonic-gate md_exit(local_sp, 205); 1434*0Sstevel@tonic-gate } else if (rval == 1) { 1435*0Sstevel@tonic-gate continue; 1436*0Sstevel@tonic-gate } else { 1437*0Sstevel@tonic-gate md_exit(local_sp, 1); 1438*0Sstevel@tonic-gate } 1439*0Sstevel@tonic-gate } 1440*0Sstevel@tonic-gate 1441*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb " 1442*0Sstevel@tonic-gate "synchronization completed for set %s: %s"), 1443*0Sstevel@tonic-gate sp->setname, 1444*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1445*0Sstevel@tonic-gate 1446*0Sstevel@tonic-gate /* Join the starting nodes to the diskset */ 1447*0Sstevel@tonic-gate rval = meta_mnjoin_all(sp, ep); 1448*0Sstevel@tonic-gate if (rval != 0) { 1449*0Sstevel@tonic-gate mde_perror(ep, gettext( 1450*0Sstevel@tonic-gate "Join of non-owner (starting) nodes " 1451*0Sstevel@tonic-gate "in set %s failed\n"), sp->setname); 1452*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Step2 - non owner" 1453*0Sstevel@tonic-gate "nodes joined for set %s: %s"), 1454*0Sstevel@tonic-gate sp->setname, 1455*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - 1456*0Sstevel@tonic-gate start_time)); 1457*0Sstevel@tonic-gate if (rval == 205) { 1458*0Sstevel@tonic-gate /* 1459*0Sstevel@tonic-gate * NOTE: Should issue call to reboot 1460*0Sstevel@tonic-gate * remote host that is causing the RPC 1461*0Sstevel@tonic-gate * failure. Clustering to provide 1462*0Sstevel@tonic-gate * interface in the future. This 1463*0Sstevel@tonic-gate * should stop a never-ending set of 1464*0Sstevel@tonic-gate * 205 reconfig cycles. 
1465*0Sstevel@tonic-gate * Remote host causing failure is 1466*0Sstevel@tonic-gate * stored in ep->host if ep is an 1467*0Sstevel@tonic-gate * RPC error. 1468*0Sstevel@tonic-gate * if (mdanyrpcerror(ep)) 1469*0Sstevel@tonic-gate * reboot (ep->host); 1470*0Sstevel@tonic-gate */ 1471*0Sstevel@tonic-gate md_exit(local_sp, 205); 1472*0Sstevel@tonic-gate } else { 1473*0Sstevel@tonic-gate md_exit(local_sp, 1); 1474*0Sstevel@tonic-gate } 1475*0Sstevel@tonic-gate } 1476*0Sstevel@tonic-gate 1477*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes " 1478*0Sstevel@tonic-gate "joined for set %s: %s"), sp->setname, 1479*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1480*0Sstevel@tonic-gate 1481*0Sstevel@tonic-gate } 1482*0Sstevel@tonic-gate 1483*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"), 1484*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1485*0Sstevel@tonic-gate 1486*0Sstevel@tonic-gate break; 1487*0Sstevel@tonic-gate 1488*0Sstevel@tonic-gate case MC_STEP3: 1489*0Sstevel@tonic-gate /* 1490*0Sstevel@tonic-gate * Step 3 1491*0Sstevel@tonic-gate * 1492*0Sstevel@tonic-gate * For all multinode sets do, 1493*0Sstevel@tonic-gate * - Reinitialise rpc.mdcommd 1494*0Sstevel@tonic-gate * - Reset mirror owners to null if the current owner is 1495*0Sstevel@tonic-gate * no longer in the membership list 1496*0Sstevel@tonic-gate */ 1497*0Sstevel@tonic-gate 1498*0Sstevel@tonic-gate /* expect the nodelist to follow the step name */ 1499*0Sstevel@tonic-gate if (argc < 1) 1500*0Sstevel@tonic-gate usage(sp, 1); 1501*0Sstevel@tonic-gate 1502*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"), 1503*0Sstevel@tonic-gate meta_print_hrtime(0)); 1504*0Sstevel@tonic-gate 1505*0Sstevel@tonic-gate /* 1506*0Sstevel@tonic-gate * Does local set exist? 
If not, exit with 0 1507*0Sstevel@tonic-gate * since there's no reason to have this node panic if 1508*0Sstevel@tonic-gate * the local set cannot be started. 1509*0Sstevel@tonic-gate */ 1510*0Sstevel@tonic-gate if ((local_sp = load_local_set(ep)) == NULL) { 1511*0Sstevel@tonic-gate md_exit(local_sp, 0); 1512*0Sstevel@tonic-gate } 1513*0Sstevel@tonic-gate 1514*0Sstevel@tonic-gate /* 1515*0Sstevel@tonic-gate * walk through all sets on this node which could include: 1516*0Sstevel@tonic-gate * - MN disksets 1517*0Sstevel@tonic-gate * - traditional disksets 1518*0Sstevel@tonic-gate * - non-existent disksets 1519*0Sstevel@tonic-gate * start mirror resync for all MN sets 1520*0Sstevel@tonic-gate */ 1521*0Sstevel@tonic-gate if ((max_sets = get_max_sets(ep)) == 0) { 1522*0Sstevel@tonic-gate mde_perror(ep, ""); 1523*0Sstevel@tonic-gate md_exit(local_sp, 1); 1524*0Sstevel@tonic-gate } 1525*0Sstevel@tonic-gate 1526*0Sstevel@tonic-gate /* start walking through all possible disksets */ 1527*0Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) { 1528*0Sstevel@tonic-gate if ((sp = metasetnosetname(setno, ep)) == NULL) { 1529*0Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 1530*0Sstevel@tonic-gate /* No set for this setno - continue */ 1531*0Sstevel@tonic-gate mdclrerror(ep); 1532*0Sstevel@tonic-gate continue; 1533*0Sstevel@tonic-gate } else { 1534*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to " 1535*0Sstevel@tonic-gate "get set %d information"), setno); 1536*0Sstevel@tonic-gate md_exit(local_sp, 1); 1537*0Sstevel@tonic-gate } 1538*0Sstevel@tonic-gate } 1539*0Sstevel@tonic-gate 1540*0Sstevel@tonic-gate /* only check multi-node disksets */ 1541*0Sstevel@tonic-gate if (!meta_is_mn_set(sp, ep)) { 1542*0Sstevel@tonic-gate mdclrerror(ep); 1543*0Sstevel@tonic-gate continue; 1544*0Sstevel@tonic-gate } 1545*0Sstevel@tonic-gate 1546*0Sstevel@tonic-gate if (meta_lock(sp, TRUE, ep) != 0) { 1547*0Sstevel@tonic-gate mde_perror(ep, ""); 1548*0Sstevel@tonic-gate 
md_exit(local_sp, 1); 1549*0Sstevel@tonic-gate } 1550*0Sstevel@tonic-gate 1551*0Sstevel@tonic-gate /* If this node isn't joined to set, do nothing */ 1552*0Sstevel@tonic-gate if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1553*0Sstevel@tonic-gate if (!mdisok(ep)) { 1554*0Sstevel@tonic-gate mde_perror(ep, gettext("Could " 1555*0Sstevel@tonic-gate "not get set %s ownership"), 1556*0Sstevel@tonic-gate sp->setname); 1557*0Sstevel@tonic-gate md_exit(sp, 1); 1558*0Sstevel@tonic-gate } 1559*0Sstevel@tonic-gate mdclrerror(ep); 1560*0Sstevel@tonic-gate meta_unlock(sp, ep); 1561*0Sstevel@tonic-gate continue; 1562*0Sstevel@tonic-gate } 1563*0Sstevel@tonic-gate 1564*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Step3 - begin " 1565*0Sstevel@tonic-gate "re-initialising rpc.mdcommd and resetting mirror " 1566*0Sstevel@tonic-gate "owners for set %s: %s"), sp->setname, 1567*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1568*0Sstevel@tonic-gate 1569*0Sstevel@tonic-gate /* reinitialzse rpc.mdcommd with new nodelist */ 1570*0Sstevel@tonic-gate if (mdmn_reinit_set(setno)) { 1571*0Sstevel@tonic-gate md_eprintf(gettext( 1572*0Sstevel@tonic-gate "Could not re-initialise rpc.mdcommd for " 1573*0Sstevel@tonic-gate "set %s\n"), sp->setname); 1574*0Sstevel@tonic-gate md_exit(sp, 1); 1575*0Sstevel@tonic-gate } 1576*0Sstevel@tonic-gate 1577*0Sstevel@tonic-gate (void) memset(&cfg, 0, sizeof (cfg)); 1578*0Sstevel@tonic-gate cfg.c_id = 0; 1579*0Sstevel@tonic-gate cfg.c_setno = sp->setno; 1580*0Sstevel@tonic-gate if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1581*0Sstevel@tonic-gate NULL) != 0) { 1582*0Sstevel@tonic-gate mdstealerror(ep, &cfg.c_mde); 1583*0Sstevel@tonic-gate mde_perror(ep, gettext("Could " 1584*0Sstevel@tonic-gate "not get set %s information"), 1585*0Sstevel@tonic-gate sp->setname); 1586*0Sstevel@tonic-gate md_exit(sp, 1); 1587*0Sstevel@tonic-gate } 1588*0Sstevel@tonic-gate 1589*0Sstevel@tonic-gate /* Don't do anything else if set is stale */ 
1590*0Sstevel@tonic-gate if (cfg.c_flags & MDDB_C_STALE) { 1591*0Sstevel@tonic-gate meta_unlock(sp, ep); 1592*0Sstevel@tonic-gate mdclrerror(ep); 1593*0Sstevel@tonic-gate continue; 1594*0Sstevel@tonic-gate } 1595*0Sstevel@tonic-gate 1596*0Sstevel@tonic-gate /* reset mirror owners */ 1597*0Sstevel@tonic-gate if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) { 1598*0Sstevel@tonic-gate md_exit(sp, 1); 1599*0Sstevel@tonic-gate } 1600*0Sstevel@tonic-gate 1601*0Sstevel@tonic-gate meta_unlock(sp, ep); 1602*0Sstevel@tonic-gate 1603*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd " 1604*0Sstevel@tonic-gate "re-initialised and mirror owners reset for " 1605*0Sstevel@tonic-gate "set %s: %s"), sp->setname, 1606*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1607*0Sstevel@tonic-gate } 1608*0Sstevel@tonic-gate 1609*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"), 1610*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1611*0Sstevel@tonic-gate 1612*0Sstevel@tonic-gate break; 1613*0Sstevel@tonic-gate 1614*0Sstevel@tonic-gate case MC_STEP4: 1615*0Sstevel@tonic-gate /* 1616*0Sstevel@tonic-gate * Step 4 1617*0Sstevel@tonic-gate * 1618*0Sstevel@tonic-gate * For all multinode sets do: 1619*0Sstevel@tonic-gate * - Resume the rpc.mdcommd messages. Must resume all 1620*0Sstevel@tonic-gate * sets before issuing I/O to any set since an error 1621*0Sstevel@tonic-gate * encountered in a commd suspended set could be 1622*0Sstevel@tonic-gate * blocked waiting for commd in another set to resume. 1623*0Sstevel@tonic-gate * (This happens since the daemon queues service 1624*0Sstevel@tonic-gate * all sets). An open of a soft partition causes 1625*0Sstevel@tonic-gate * a read of the watermarks during the open. 1626*0Sstevel@tonic-gate * - If set is non-writable (not an owner or STALE), then 1627*0Sstevel@tonic-gate * continue to next set. 
1628*0Sstevel@tonic-gate * 1629*0Sstevel@tonic-gate * For all multinode sets do, 1630*0Sstevel@tonic-gate * - Reset ABR states for all mirrors, i.e. clear ABR if not 1631*0Sstevel@tonic-gate * open on any node. 1632*0Sstevel@tonic-gate * - Reset ABR states for all soft partitions, i.e. clear ABR if 1633*0Sstevel@tonic-gate * not open on any node. 1634*0Sstevel@tonic-gate * - For all slave nodes that have entered through the start 1635*0Sstevel@tonic-gate * step, update the ABR state to that of the master and 1636*0Sstevel@tonic-gate * get the submirror state from the master 1637*0Sstevel@tonic-gate * - meta_lock set 1638*0Sstevel@tonic-gate * - Resync all mirrors 1639*0Sstevel@tonic-gate * - unlock meta_lock for this set. 1640*0Sstevel@tonic-gate * - Choose a new owner for any orphaned resyncs 1641*0Sstevel@tonic-gate * 1642*0Sstevel@tonic-gate * There is one potential issue here when concurrently 1643*0Sstevel@tonic-gate * resetting and updating the ABR state. If the master has ABR 1644*0Sstevel@tonic-gate * set, but should no longer have it because the only node that 1645*0Sstevel@tonic-gate * had the metadevice open and had ABR set has panicked, the 1646*0Sstevel@tonic-gate * master will send a message to all nodes to clear the ABR 1647*0Sstevel@tonic-gate * state. Meanwhile any node that has come through the 1648*0Sstevel@tonic-gate * start step will get tstate from the master and will update 1649*0Sstevel@tonic-gate * ABR if it was set in tstate. So, we appear to have a problem 1650*0Sstevel@tonic-gate * if the following sequence occurs:- 1651*0Sstevel@tonic-gate * - The slave gets tstate with ABR set 1652*0Sstevel@tonic-gate * - The master sends a message to clear ABR 1653*0Sstevel@tonic-gate * - The slave updates ABR with the value it got from tstate. 1654*0Sstevel@tonic-gate * We now have the master with ABR clear and the slave with ABR 1655*0Sstevel@tonic-gate * set.
Fortunately, having set ABR, the slave will close the 1656*0Sstevel@tonic-gate * metadevice after setting ABR and as there are no nodes with 1657*0Sstevel@tonic-gate * the device open, the close will send a message to clear ABR 1658*0Sstevel@tonic-gate * on all nodes. So, the nodes will all have ABR unset. 1659*0Sstevel@tonic-gate */ 1660*0Sstevel@tonic-gate 1661*0Sstevel@tonic-gate /* expect the nodelist to follow the step name */ 1662*0Sstevel@tonic-gate if (argc < 1) 1663*0Sstevel@tonic-gate usage(sp, 1); 1664*0Sstevel@tonic-gate 1665*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"), 1666*0Sstevel@tonic-gate meta_print_hrtime(0)); 1667*0Sstevel@tonic-gate 1668*0Sstevel@tonic-gate /* 1669*0Sstevel@tonic-gate * Does local set exist? If not, exit with 0 1670*0Sstevel@tonic-gate * since there's no reason to have this node panic if 1671*0Sstevel@tonic-gate * the local set cannot be started. 1672*0Sstevel@tonic-gate */ 1673*0Sstevel@tonic-gate if ((local_sp = load_local_set(ep)) == NULL) { 1674*0Sstevel@tonic-gate md_exit(local_sp, 0); 1675*0Sstevel@tonic-gate } 1676*0Sstevel@tonic-gate 1677*0Sstevel@tonic-gate /* 1678*0Sstevel@tonic-gate * walk through all sets on this node which could include: 1679*0Sstevel@tonic-gate * - MN disksets 1680*0Sstevel@tonic-gate * - traditional disksets 1681*0Sstevel@tonic-gate * - non-existent disksets 1682*0Sstevel@tonic-gate * start mirror resync for all MN sets 1683*0Sstevel@tonic-gate */ 1684*0Sstevel@tonic-gate if ((max_sets = get_max_sets(ep)) == 0) { 1685*0Sstevel@tonic-gate mde_perror(ep, ""); 1686*0Sstevel@tonic-gate md_exit(local_sp, 1); 1687*0Sstevel@tonic-gate } 1688*0Sstevel@tonic-gate 1689*0Sstevel@tonic-gate /* Clear set_info structure */ 1690*0Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) { 1691*0Sstevel@tonic-gate set_info[setno] = 0; 1692*0Sstevel@tonic-gate } 1693*0Sstevel@tonic-gate 1694*0Sstevel@tonic-gate /* start walking through all possible disksets */ 
1695*0Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) { 1696*0Sstevel@tonic-gate if ((sp = metasetnosetname(setno, ep)) == NULL) { 1697*0Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 1698*0Sstevel@tonic-gate /* No set for this setno - continue */ 1699*0Sstevel@tonic-gate mdclrerror(ep); 1700*0Sstevel@tonic-gate continue; 1701*0Sstevel@tonic-gate } else { 1702*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to " 1703*0Sstevel@tonic-gate "get set %d information"), setno); 1704*0Sstevel@tonic-gate md_exit(local_sp, 1); 1705*0Sstevel@tonic-gate } 1706*0Sstevel@tonic-gate } 1707*0Sstevel@tonic-gate 1708*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1709*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to get set " 1710*0Sstevel@tonic-gate "%s desc information"), sp->setname); 1711*0Sstevel@tonic-gate mdclrerror(ep); 1712*0Sstevel@tonic-gate continue; 1713*0Sstevel@tonic-gate } 1714*0Sstevel@tonic-gate 1715*0Sstevel@tonic-gate /* only check multi-node disksets */ 1716*0Sstevel@tonic-gate if (!meta_is_mn_set(sp, ep)) { 1717*0Sstevel@tonic-gate mdclrerror(ep); 1718*0Sstevel@tonic-gate continue; 1719*0Sstevel@tonic-gate } 1720*0Sstevel@tonic-gate 1721*0Sstevel@tonic-gate set_info[setno] |= SET_INFO_MN; 1722*0Sstevel@tonic-gate 1723*0Sstevel@tonic-gate /* 1724*0Sstevel@tonic-gate * If not an owner (all mddbs failed) or stale 1725*0Sstevel@tonic-gate * (< 50% mddbs operational), then set is 1726*0Sstevel@tonic-gate * non-writable so just resume commd and 1727*0Sstevel@tonic-gate * unblock mddb messages. 
1728*0Sstevel@tonic-gate */ 1729*0Sstevel@tonic-gate mdclrerror(ep); 1730*0Sstevel@tonic-gate if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1731*0Sstevel@tonic-gate set_info[setno] |= SET_INFO_NO_WR; 1732*0Sstevel@tonic-gate } 1733*0Sstevel@tonic-gate if (!mdisok(ep)) { 1734*0Sstevel@tonic-gate mde_perror(ep, gettext("Could " 1735*0Sstevel@tonic-gate "not get set %s ownership"), 1736*0Sstevel@tonic-gate sp->setname); 1737*0Sstevel@tonic-gate md_exit(local_sp, 1); 1738*0Sstevel@tonic-gate } 1739*0Sstevel@tonic-gate /* Set is owned (NO_WR not set above) - is it stale? '!' binds tighter than '&', so the flag test must be parenthesised. */ 1740*0Sstevel@tonic-gate if (!(set_info[setno] & SET_INFO_NO_WR)) { 1741*0Sstevel@tonic-gate (void) memset(&cfg, 0, sizeof (cfg)); 1742*0Sstevel@tonic-gate cfg.c_id = 0; 1743*0Sstevel@tonic-gate cfg.c_setno = sp->setno; 1744*0Sstevel@tonic-gate if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1745*0Sstevel@tonic-gate NULL) != 0) { 1746*0Sstevel@tonic-gate mdstealerror(ep, &cfg.c_mde); 1747*0Sstevel@tonic-gate mde_perror(ep, gettext("Could " 1748*0Sstevel@tonic-gate "not get set %s information"), 1749*0Sstevel@tonic-gate sp->setname); 1750*0Sstevel@tonic-gate md_exit(local_sp, 1); 1751*0Sstevel@tonic-gate } 1752*0Sstevel@tonic-gate if (cfg.c_flags & MDDB_C_STALE) { 1753*0Sstevel@tonic-gate set_info[setno] |= SET_INFO_NO_WR; 1754*0Sstevel@tonic-gate } 1755*0Sstevel@tonic-gate } 1756*0Sstevel@tonic-gate 1757*0Sstevel@tonic-gate /* resume rpc.mdcommd */ 1758*0Sstevel@tonic-gate if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0)) { 1759*0Sstevel@tonic-gate md_eprintf(gettext("Unable to resume " 1760*0Sstevel@tonic-gate "rpc.mdcommd for set %s\n"), sp->setname); 1761*0Sstevel@tonic-gate md_exit(local_sp, 1); 1762*0Sstevel@tonic-gate } 1763*0Sstevel@tonic-gate meta_ping_mnset(setno); 1764*0Sstevel@tonic-gate 1765*0Sstevel@tonic-gate /* Unblock mddb parse messages */ 1766*0Sstevel@tonic-gate if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1767*0Sstevel@tonic-gate (void) memset(&mbp, 0, sizeof (mbp)); 1768*0Sstevel@tonic-gate
mbp.c_setno = setno; 1769*0Sstevel@tonic-gate mbp.c_blk_flags = MDDB_UNBLOCK_PARSE; 1770*0Sstevel@tonic-gate if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1771*0Sstevel@tonic-gate &mbp.c_mde, NULL)) { 1772*0Sstevel@tonic-gate mdstealerror(ep, &mbp.c_mde); 1773*0Sstevel@tonic-gate mde_perror(ep, gettext("Could not " 1774*0Sstevel@tonic-gate "unblock set %s"), sp->setname); 1775*0Sstevel@tonic-gate md_exit(local_sp, 1); 1776*0Sstevel@tonic-gate } 1777*0Sstevel@tonic-gate } 1778*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd " 1779*0Sstevel@tonic-gate "resumed and messages unblocked for set %s: %s"), 1780*0Sstevel@tonic-gate sp->setname, 1781*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1782*0Sstevel@tonic-gate } 1783*0Sstevel@tonic-gate 1784*0Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) { 1785*0Sstevel@tonic-gate int start_step; 1786*0Sstevel@tonic-gate 1787*0Sstevel@tonic-gate /* Skip traditional disksets. */ 1788*0Sstevel@tonic-gate if ((set_info[setno] & SET_INFO_MN) == 0) 1789*0Sstevel@tonic-gate continue; 1790*0Sstevel@tonic-gate 1791*0Sstevel@tonic-gate /* 1792*0Sstevel@tonic-gate * If already determined that this set is 1793*0Sstevel@tonic-gate * a non-writable set, then just continue 1794*0Sstevel@tonic-gate * to next set since there's nothing else 1795*0Sstevel@tonic-gate * to do for a non-writable set. 
1796*0Sstevel@tonic-gate */ 1797*0Sstevel@tonic-gate if (set_info[setno] & SET_INFO_NO_WR) 1798*0Sstevel@tonic-gate continue; 1799*0Sstevel@tonic-gate 1800*0Sstevel@tonic-gate if ((sp = metasetnosetname(setno, ep)) == NULL) { 1801*0Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 1802*0Sstevel@tonic-gate /* No set for this setno - continue */ 1803*0Sstevel@tonic-gate mdclrerror(ep); 1804*0Sstevel@tonic-gate continue; 1805*0Sstevel@tonic-gate } else { 1806*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to " 1807*0Sstevel@tonic-gate "get set %d information"), setno); 1808*0Sstevel@tonic-gate md_exit(local_sp, 1); 1809*0Sstevel@tonic-gate } 1810*0Sstevel@tonic-gate } 1811*0Sstevel@tonic-gate 1812*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1813*0Sstevel@tonic-gate mde_perror(ep, gettext("Unable to get set " 1814*0Sstevel@tonic-gate "%s desc information"), sp->setname); 1815*0Sstevel@tonic-gate mdclrerror(ep); 1816*0Sstevel@tonic-gate continue; 1817*0Sstevel@tonic-gate } 1818*0Sstevel@tonic-gate 1819*0Sstevel@tonic-gate /* See if this node came through the start step */ 1820*0Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 1821*0Sstevel@tonic-gate sf.sf_setno = sp->setno; 1822*0Sstevel@tonic-gate sf.sf_flags = MDDB_NM_GET; 1823*0Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */ 1824*0Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1825*0Sstevel@tonic-gate if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1826*0Sstevel@tonic-gate &sf.sf_mde, NULL)) { 1827*0Sstevel@tonic-gate mdstealerror(ep, &sf.sf_mde); 1828*0Sstevel@tonic-gate mde_perror(ep, gettext("Could not get " 1829*0Sstevel@tonic-gate "start_step flag for set %s"), sp->setname); 1830*0Sstevel@tonic-gate md_exit(local_sp, 1); 1831*0Sstevel@tonic-gate } 1832*0Sstevel@tonic-gate start_step = 1833*0Sstevel@tonic-gate (sf.sf_setflags & MD_SET_MN_START_RC)? 
1: 0; 1834*0Sstevel@tonic-gate 1835*0Sstevel@tonic-gate /* 1836*0Sstevel@tonic-gate * We can now reset the start_step flag for the set 1837*0Sstevel@tonic-gate * if it was already set. 1838*0Sstevel@tonic-gate */ 1839*0Sstevel@tonic-gate if (start_step) { 1840*0Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 1841*0Sstevel@tonic-gate sf.sf_setno = sp->setno; 1842*0Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_START_RC; 1843*0Sstevel@tonic-gate sf.sf_flags = MDDB_NM_RESET; 1844*0Sstevel@tonic-gate /* 1845*0Sstevel@tonic-gate * Use magic to help protect ioctl 1846*0Sstevel@tonic-gate * against attack. 1847*0Sstevel@tonic-gate */ 1848*0Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1849*0Sstevel@tonic-gate if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1850*0Sstevel@tonic-gate &sf.sf_mde, NULL)) { 1851*0Sstevel@tonic-gate mdstealerror(ep, &sf.sf_mde); 1852*0Sstevel@tonic-gate mde_perror(ep, 1853*0Sstevel@tonic-gate gettext("Could not reset " 1854*0Sstevel@tonic-gate "start_step flag for set %s"), 1855*0Sstevel@tonic-gate sp->setname); 1856*0Sstevel@tonic-gate } 1857*0Sstevel@tonic-gate } 1858*0Sstevel@tonic-gate 1859*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Step4 - begin setting " 1860*0Sstevel@tonic-gate "ABR state and restarting io's for " 1861*0Sstevel@tonic-gate "set %s: %s"), sp->setname, 1862*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1863*0Sstevel@tonic-gate 1864*0Sstevel@tonic-gate 1865*0Sstevel@tonic-gate /* 1866*0Sstevel@tonic-gate * If we are not the master and we have come through 1867*0Sstevel@tonic-gate * the start step, we must update the ABR states 1868*0Sstevel@tonic-gate * for mirrors and soft partitions. Also the submirror 1869*0Sstevel@tonic-gate * states need to be synchronised so that we see the 1870*0Sstevel@tonic-gate * same status as other previously joined members. 1871*0Sstevel@tonic-gate * This _must_ be done before starting the resync. 
1872*0Sstevel@tonic-gate */ 1873*0Sstevel@tonic-gate if (!(sd->sd_mn_am_i_master) && start_step) { 1874*0Sstevel@tonic-gate if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR, 1875*0Sstevel@tonic-gate ep) == -1) { 1876*0Sstevel@tonic-gate md_exit(local_sp, 1); 1877*0Sstevel@tonic-gate } 1878*0Sstevel@tonic-gate if (reset_state(UPDATE_ABR, sp, MD_SP, 1879*0Sstevel@tonic-gate ep) == -1) { 1880*0Sstevel@tonic-gate md_exit(local_sp, 1); 1881*0Sstevel@tonic-gate } 1882*0Sstevel@tonic-gate /* 1883*0Sstevel@tonic-gate * Mark the fact that we've got the mirror 1884*0Sstevel@tonic-gate * state. This allows the resync thread to 1885*0Sstevel@tonic-gate * determine if _it_ needs to issue this. This 1886*0Sstevel@tonic-gate * can happen if a node is added to a set after 1887*0Sstevel@tonic-gate * a reconfig cycle has completed. 1888*0Sstevel@tonic-gate */ 1889*0Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 1890*0Sstevel@tonic-gate sf.sf_setno = sp->setno; 1891*0Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_MIR_STATE_RC; 1892*0Sstevel@tonic-gate sf.sf_flags = MDDB_NM_SET; 1893*0Sstevel@tonic-gate /* 1894*0Sstevel@tonic-gate * Use magic to help protect ioctl 1895*0Sstevel@tonic-gate * against attack. 
1896*0Sstevel@tonic-gate */ 1897*0Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1898*0Sstevel@tonic-gate if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1899*0Sstevel@tonic-gate &sf.sf_mde, NULL)) { 1900*0Sstevel@tonic-gate mdstealerror(ep, &sf.sf_mde); 1901*0Sstevel@tonic-gate mde_perror(ep, 1902*0Sstevel@tonic-gate gettext("Could not set " 1903*0Sstevel@tonic-gate "submirror state flag for set %s"), 1904*0Sstevel@tonic-gate sp->setname); 1905*0Sstevel@tonic-gate } 1906*0Sstevel@tonic-gate } 1907*0Sstevel@tonic-gate 1908*0Sstevel@tonic-gate /* 1909*0Sstevel@tonic-gate * All remaining actions are only performed by the 1910*0Sstevel@tonic-gate * master 1911*0Sstevel@tonic-gate */ 1912*0Sstevel@tonic-gate if (!(sd->sd_mn_am_i_master)) { 1913*0Sstevel@tonic-gate if (meta_lock(sp, TRUE, ep) != 0) { 1914*0Sstevel@tonic-gate mde_perror(ep, ""); 1915*0Sstevel@tonic-gate md_exit(local_sp, 1); 1916*0Sstevel@tonic-gate } 1917*0Sstevel@tonic-gate meta_mirror_resync_unblock(sp); 1918*0Sstevel@tonic-gate meta_unlock(sp, ep); 1919*0Sstevel@tonic-gate continue; 1920*0Sstevel@tonic-gate } 1921*0Sstevel@tonic-gate 1922*0Sstevel@tonic-gate /* 1923*0Sstevel@tonic-gate * If the master came through the start step, this 1924*0Sstevel@tonic-gate * implies that all of the nodes must have done the 1925*0Sstevel@tonic-gate * same and hence there can be no applications 1926*0Sstevel@tonic-gate * running. 
Hence no need to reset ABR 1927*0Sstevel@tonic-gate */ 1928*0Sstevel@tonic-gate if (!start_step) { 1929*0Sstevel@tonic-gate /* Reset ABR state for mirrors */ 1930*0Sstevel@tonic-gate if (reset_state(RESET_ABR, sp, MD_MIRROR, 1931*0Sstevel@tonic-gate ep) == -1) { 1932*0Sstevel@tonic-gate md_exit(local_sp, 1); 1933*0Sstevel@tonic-gate } 1934*0Sstevel@tonic-gate /* ...and now the same for soft partitions */ 1935*0Sstevel@tonic-gate if (reset_state(RESET_ABR, sp, MD_SP, 1936*0Sstevel@tonic-gate ep) == -1) { 1937*0Sstevel@tonic-gate md_exit(local_sp, 1); 1938*0Sstevel@tonic-gate } 1939*0Sstevel@tonic-gate } 1940*0Sstevel@tonic-gate 1941*0Sstevel@tonic-gate /* 1942*0Sstevel@tonic-gate * choose owners for orphaned resyncs and reset 1943*0Sstevel@tonic-gate * non-orphaned resyncs so that an owner node that 1944*0Sstevel@tonic-gate * reboots will restart the resync if needed. 1945*0Sstevel@tonic-gate */ 1946*0Sstevel@tonic-gate if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1) 1947*0Sstevel@tonic-gate md_exit(local_sp, 1); 1948*0Sstevel@tonic-gate 1949*0Sstevel@tonic-gate /* 1950*0Sstevel@tonic-gate * Must unlock set lock before meta_mirror_resync_all 1951*0Sstevel@tonic-gate * sends a message to run the metasync command 1952*0Sstevel@tonic-gate * which also grabs the meta_lock. 
1953*0Sstevel@tonic-gate */ 1954*0Sstevel@tonic-gate if (meta_lock(sp, TRUE, ep) != 0) { 1955*0Sstevel@tonic-gate mde_perror(ep, ""); 1956*0Sstevel@tonic-gate md_exit(local_sp, 1); 1957*0Sstevel@tonic-gate } 1958*0Sstevel@tonic-gate meta_mirror_resync_unblock(sp); 1959*0Sstevel@tonic-gate meta_unlock(sp, ep); 1960*0Sstevel@tonic-gate 1961*0Sstevel@tonic-gate /* resync all mirrors in set */ 1962*0Sstevel@tonic-gate if (meta_mirror_resync_all(sp, 0, ep) != 0) { 1963*0Sstevel@tonic-gate mde_perror(ep, gettext("Mirror resyncs " 1964*0Sstevel@tonic-gate "failed for set %s"), sp->setname); 1965*0Sstevel@tonic-gate md_exit(local_sp, 1); 1966*0Sstevel@tonic-gate } 1967*0Sstevel@tonic-gate 1968*0Sstevel@tonic-gate meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted " 1969*0Sstevel@tonic-gate "for set %s: %s"), sp->setname, 1970*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1971*0Sstevel@tonic-gate } 1972*0Sstevel@tonic-gate 1973*0Sstevel@tonic-gate meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"), 1974*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 1975*0Sstevel@tonic-gate 1976*0Sstevel@tonic-gate break; 1977*0Sstevel@tonic-gate 1978*0Sstevel@tonic-gate default: 1979*0Sstevel@tonic-gate usage(sp, 1); 1980*0Sstevel@tonic-gate break; 1981*0Sstevel@tonic-gate } 1982*0Sstevel@tonic-gate 1983*0Sstevel@tonic-gate md_exit(sp, 0); 1984*0Sstevel@tonic-gate /* NOTREACHED */ 1985*0Sstevel@tonic-gate return (0); 1986*0Sstevel@tonic-gate } 1987