1*0Sstevel@tonic-gate /* 2*0Sstevel@tonic-gate * CDDL HEADER START 3*0Sstevel@tonic-gate * 4*0Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*0Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only 6*0Sstevel@tonic-gate * (the "License"). You may not use this file except in compliance 7*0Sstevel@tonic-gate * with the License. 8*0Sstevel@tonic-gate * 9*0Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*0Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 11*0Sstevel@tonic-gate * See the License for the specific language governing permissions 12*0Sstevel@tonic-gate * and limitations under the License. 13*0Sstevel@tonic-gate * 14*0Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 15*0Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*0Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 17*0Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 18*0Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 19*0Sstevel@tonic-gate * 20*0Sstevel@tonic-gate * CDDL HEADER END 21*0Sstevel@tonic-gate */ 22*0Sstevel@tonic-gate /* 23*0Sstevel@tonic-gate * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24*0Sstevel@tonic-gate * Use is subject to license terms. 25*0Sstevel@tonic-gate */ 26*0Sstevel@tonic-gate 27*0Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 28*0Sstevel@tonic-gate 29*0Sstevel@tonic-gate /* 30*0Sstevel@tonic-gate * Just in case we're not in a build environment, make sure that 31*0Sstevel@tonic-gate * TEXT_DOMAIN gets set to something. 
32*0Sstevel@tonic-gate */ 33*0Sstevel@tonic-gate #if !defined(TEXT_DOMAIN) 34*0Sstevel@tonic-gate #define TEXT_DOMAIN "SYS_TEST" 35*0Sstevel@tonic-gate #endif 36*0Sstevel@tonic-gate 37*0Sstevel@tonic-gate /* 38*0Sstevel@tonic-gate * Metadevice diskset interfaces 39*0Sstevel@tonic-gate */ 40*0Sstevel@tonic-gate 41*0Sstevel@tonic-gate #include "meta_set_prv.h" 42*0Sstevel@tonic-gate #include <meta.h> 43*0Sstevel@tonic-gate #include <metad.h> 44*0Sstevel@tonic-gate #include <mdmn_changelog.h> 45*0Sstevel@tonic-gate #include <sys/lvm/md_crc.h> 46*0Sstevel@tonic-gate #include <sys/utsname.h> 47*0Sstevel@tonic-gate #include <sdssc.h> 48*0Sstevel@tonic-gate 49*0Sstevel@tonic-gate #include <sys/sysevent/eventdefs.h> 50*0Sstevel@tonic-gate #include <sys/sysevent/svm.h> 51*0Sstevel@tonic-gate extern char *blkname(char *); 52*0Sstevel@tonic-gate 53*0Sstevel@tonic-gate static md_drive_desc * 54*0Sstevel@tonic-gate dr2drivedesc( 55*0Sstevel@tonic-gate mdsetname_t *sp, 56*0Sstevel@tonic-gate side_t sideno, 57*0Sstevel@tonic-gate int flags, 58*0Sstevel@tonic-gate md_error_t *ep 59*0Sstevel@tonic-gate ) 60*0Sstevel@tonic-gate { 61*0Sstevel@tonic-gate md_set_record *sr; 62*0Sstevel@tonic-gate md_drive_record *dr; 63*0Sstevel@tonic-gate mddrivename_t *dnp; 64*0Sstevel@tonic-gate md_drive_desc *dd_head = NULL; 65*0Sstevel@tonic-gate md_set_desc *sd; 66*0Sstevel@tonic-gate 67*0Sstevel@tonic-gate if (flags & MD_BYPASS_DAEMON) { 68*0Sstevel@tonic-gate if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL) 69*0Sstevel@tonic-gate return (NULL); 70*0Sstevel@tonic-gate sd = metaget_setdesc(sp, ep); 71*0Sstevel@tonic-gate sideno = getnodeside(mynode(), sd); 72*0Sstevel@tonic-gate sp = metafakesetname(sp->setno, sr->sr_setname); 73*0Sstevel@tonic-gate } else { 74*0Sstevel@tonic-gate if ((sr = getsetbyname(sp->setname, ep)) == NULL) 75*0Sstevel@tonic-gate return (NULL); 76*0Sstevel@tonic-gate } 77*0Sstevel@tonic-gate 78*0Sstevel@tonic-gate assert(sideno != MD_SIDEWILD); 79*0Sstevel@tonic-gate 
80*0Sstevel@tonic-gate /* 81*0Sstevel@tonic-gate * WARNING: 82*0Sstevel@tonic-gate * The act of getting the dnp from the namespace means that we 83*0Sstevel@tonic-gate * will get the devid of the disk as recorded in the namespace. 84*0Sstevel@tonic-gate * This devid has the potential to be stale if the disk is being 85*0Sstevel@tonic-gate * replaced via a rebind, this means that any code that relies 86*0Sstevel@tonic-gate * on any of the dnp information should take the appropriate action 87*0Sstevel@tonic-gate * to preserve that information. For example in the rebind code the 88*0Sstevel@tonic-gate * devid of the new disk is saved off and then copied back in once 89*0Sstevel@tonic-gate * the code that has called this function has completed. 90*0Sstevel@tonic-gate */ 91*0Sstevel@tonic-gate for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) { 92*0Sstevel@tonic-gate if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key, 93*0Sstevel@tonic-gate flags, ep)) == NULL) { 94*0Sstevel@tonic-gate if (!(flags & MD_BYPASS_DAEMON)) 95*0Sstevel@tonic-gate free_sr(sr); 96*0Sstevel@tonic-gate metafreedrivedesc(&dd_head); 97*0Sstevel@tonic-gate return (NULL); 98*0Sstevel@tonic-gate } 99*0Sstevel@tonic-gate 100*0Sstevel@tonic-gate (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt, 101*0Sstevel@tonic-gate dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags); 102*0Sstevel@tonic-gate } 103*0Sstevel@tonic-gate 104*0Sstevel@tonic-gate if (!(flags & MD_BYPASS_DAEMON)) { 105*0Sstevel@tonic-gate free_sr(sr); 106*0Sstevel@tonic-gate } 107*0Sstevel@tonic-gate return (dd_head); 108*0Sstevel@tonic-gate } 109*0Sstevel@tonic-gate 110*0Sstevel@tonic-gate static int 111*0Sstevel@tonic-gate get_sidenmlist( 112*0Sstevel@tonic-gate mdsetname_t *sp, 113*0Sstevel@tonic-gate mddrivename_t *dnp, 114*0Sstevel@tonic-gate md_error_t *ep 115*0Sstevel@tonic-gate ) 116*0Sstevel@tonic-gate { 117*0Sstevel@tonic-gate md_set_desc *sd; 118*0Sstevel@tonic-gate mdsidenames_t *sn, **sn_next; 
119*0Sstevel@tonic-gate int i; 120*0Sstevel@tonic-gate 121*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 122*0Sstevel@tonic-gate return (-1); 123*0Sstevel@tonic-gate 124*0Sstevel@tonic-gate metaflushsidenames(dnp); 125*0Sstevel@tonic-gate sn_next = &dnp->side_names; 126*0Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 127*0Sstevel@tonic-gate /* 128*0Sstevel@tonic-gate * Only get sidenames for this node since 129*0Sstevel@tonic-gate * that is the only side information stored in 130*0Sstevel@tonic-gate * the local mddb for a multi-node diskset. 131*0Sstevel@tonic-gate */ 132*0Sstevel@tonic-gate if (sd->sd_mn_mynode) { 133*0Sstevel@tonic-gate sn = Zalloc(sizeof (*sn)); 134*0Sstevel@tonic-gate sn->sideno = sd->sd_mn_mynode->nd_nodeid; 135*0Sstevel@tonic-gate if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 136*0Sstevel@tonic-gate sn->sideno, dnp->side_names_key, &sn->dname, 137*0Sstevel@tonic-gate &sn->mnum, NULL, ep)) == NULL) { 138*0Sstevel@tonic-gate if (sn->dname != NULL) 139*0Sstevel@tonic-gate Free(sn->dname); 140*0Sstevel@tonic-gate Free(sn); 141*0Sstevel@tonic-gate return (-1); 142*0Sstevel@tonic-gate } 143*0Sstevel@tonic-gate 144*0Sstevel@tonic-gate /* Add to the end of the linked list */ 145*0Sstevel@tonic-gate assert(*sn_next == NULL); 146*0Sstevel@tonic-gate *sn_next = sn; 147*0Sstevel@tonic-gate sn_next = &sn->next; 148*0Sstevel@tonic-gate } 149*0Sstevel@tonic-gate } else { 150*0Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 151*0Sstevel@tonic-gate /* Skip empty slots */ 152*0Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 153*0Sstevel@tonic-gate continue; 154*0Sstevel@tonic-gate 155*0Sstevel@tonic-gate sn = Zalloc(sizeof (*sn)); 156*0Sstevel@tonic-gate sn->sideno = i; 157*0Sstevel@tonic-gate if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 158*0Sstevel@tonic-gate i+SKEW, dnp->side_names_key, &sn->dname, 159*0Sstevel@tonic-gate &sn->mnum, NULL, ep)) == NULL) { 160*0Sstevel@tonic-gate /* 161*0Sstevel@tonic-gate * It is possible 
that during the add of a 162*0Sstevel@tonic-gate * host to have a 'missing' side as the side 163*0Sstevel@tonic-gate * for this disk will be added later. So ignore 164*0Sstevel@tonic-gate * the error. The 'missing' side will be added 165*0Sstevel@tonic-gate * once the addhosts process has completed. 166*0Sstevel@tonic-gate */ 167*0Sstevel@tonic-gate if (mdissyserror(ep, ENOENT)) { 168*0Sstevel@tonic-gate mdclrerror(ep); 169*0Sstevel@tonic-gate Free(sn); 170*0Sstevel@tonic-gate continue; 171*0Sstevel@tonic-gate } 172*0Sstevel@tonic-gate 173*0Sstevel@tonic-gate if (sn->dname != NULL) 174*0Sstevel@tonic-gate Free(sn->dname); 175*0Sstevel@tonic-gate Free(sn); 176*0Sstevel@tonic-gate return (-1); 177*0Sstevel@tonic-gate } 178*0Sstevel@tonic-gate 179*0Sstevel@tonic-gate /* Add to the end of the linked list */ 180*0Sstevel@tonic-gate assert(*sn_next == NULL); 181*0Sstevel@tonic-gate *sn_next = sn; 182*0Sstevel@tonic-gate sn_next = &sn->next; 183*0Sstevel@tonic-gate } 184*0Sstevel@tonic-gate } 185*0Sstevel@tonic-gate 186*0Sstevel@tonic-gate return (0); 187*0Sstevel@tonic-gate } 188*0Sstevel@tonic-gate 189*0Sstevel@tonic-gate static md_drive_desc * 190*0Sstevel@tonic-gate rl_to_dd( 191*0Sstevel@tonic-gate mdsetname_t *sp, 192*0Sstevel@tonic-gate md_replicalist_t *rlp, 193*0Sstevel@tonic-gate md_error_t *ep 194*0Sstevel@tonic-gate ) 195*0Sstevel@tonic-gate { 196*0Sstevel@tonic-gate md_replicalist_t *rl; 197*0Sstevel@tonic-gate md_replica_t *r; 198*0Sstevel@tonic-gate md_drive_desc *dd = NULL; 199*0Sstevel@tonic-gate md_drive_desc *d; 200*0Sstevel@tonic-gate int found; 201*0Sstevel@tonic-gate md_set_desc *sd; 202*0Sstevel@tonic-gate daddr_t nblks = 0; 203*0Sstevel@tonic-gate 204*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 205*0Sstevel@tonic-gate return (NULL); 206*0Sstevel@tonic-gate 207*0Sstevel@tonic-gate /* find the smallest existing replica */ 208*0Sstevel@tonic-gate for (rl = rlp; rl != NULL; rl = rl->rl_next) { 209*0Sstevel@tonic-gate r = 
rl->rl_repp; 210*0Sstevel@tonic-gate nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 211*0Sstevel@tonic-gate } 212*0Sstevel@tonic-gate 213*0Sstevel@tonic-gate if (nblks <= 0) 214*0Sstevel@tonic-gate nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 215*0Sstevel@tonic-gate 216*0Sstevel@tonic-gate for (rl = rlp; rl != NULL; rl = rl->rl_next) { 217*0Sstevel@tonic-gate r = rl->rl_repp; 218*0Sstevel@tonic-gate 219*0Sstevel@tonic-gate found = 0; 220*0Sstevel@tonic-gate for (d = dd; d != NULL; d = d->dd_next) { 221*0Sstevel@tonic-gate if (strcmp(r->r_namep->drivenamep->cname, 222*0Sstevel@tonic-gate d->dd_dnp->cname) == 0) { 223*0Sstevel@tonic-gate found = 1; 224*0Sstevel@tonic-gate dd->dd_dbcnt++; 225*0Sstevel@tonic-gate break; 226*0Sstevel@tonic-gate } 227*0Sstevel@tonic-gate } 228*0Sstevel@tonic-gate 229*0Sstevel@tonic-gate if (! found) 230*0Sstevel@tonic-gate (void) metadrivedesc_append(&dd, r->r_namep->drivenamep, 231*0Sstevel@tonic-gate 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK); 232*0Sstevel@tonic-gate } 233*0Sstevel@tonic-gate 234*0Sstevel@tonic-gate return (dd); 235*0Sstevel@tonic-gate } 236*0Sstevel@tonic-gate 237*0Sstevel@tonic-gate /* 238*0Sstevel@tonic-gate * Exported Entry Points 239*0Sstevel@tonic-gate */ 240*0Sstevel@tonic-gate 241*0Sstevel@tonic-gate set_t 242*0Sstevel@tonic-gate get_max_sets(md_error_t *ep) 243*0Sstevel@tonic-gate { 244*0Sstevel@tonic-gate 245*0Sstevel@tonic-gate static set_t max_sets = 0; 246*0Sstevel@tonic-gate 247*0Sstevel@tonic-gate if (max_sets == 0) 248*0Sstevel@tonic-gate if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0) 249*0Sstevel@tonic-gate return (0); 250*0Sstevel@tonic-gate 251*0Sstevel@tonic-gate return (max_sets); 252*0Sstevel@tonic-gate } 253*0Sstevel@tonic-gate 254*0Sstevel@tonic-gate int 255*0Sstevel@tonic-gate get_max_meds(md_error_t *ep) 256*0Sstevel@tonic-gate { 257*0Sstevel@tonic-gate static int max_meds = 0; 258*0Sstevel@tonic-gate 259*0Sstevel@tonic-gate if (max_meds == 0) 
260*0Sstevel@tonic-gate if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0) 261*0Sstevel@tonic-gate return (0); 262*0Sstevel@tonic-gate 263*0Sstevel@tonic-gate return (max_meds); 264*0Sstevel@tonic-gate } 265*0Sstevel@tonic-gate 266*0Sstevel@tonic-gate side_t 267*0Sstevel@tonic-gate getmyside(mdsetname_t *sp, md_error_t *ep) 268*0Sstevel@tonic-gate { 269*0Sstevel@tonic-gate md_set_desc *sd; 270*0Sstevel@tonic-gate char *node = NULL; 271*0Sstevel@tonic-gate side_t sideno; 272*0Sstevel@tonic-gate 273*0Sstevel@tonic-gate if (sp->setno == 0) 274*0Sstevel@tonic-gate return (0); 275*0Sstevel@tonic-gate 276*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 277*0Sstevel@tonic-gate return (MD_SIDEWILD); 278*0Sstevel@tonic-gate 279*0Sstevel@tonic-gate node = mynode(); 280*0Sstevel@tonic-gate 281*0Sstevel@tonic-gate assert(node != NULL); 282*0Sstevel@tonic-gate 283*0Sstevel@tonic-gate sideno = getnodeside(node, sd); 284*0Sstevel@tonic-gate 285*0Sstevel@tonic-gate if (sideno != MD_SIDEWILD) 286*0Sstevel@tonic-gate return (sideno); 287*0Sstevel@tonic-gate 288*0Sstevel@tonic-gate return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node)); 289*0Sstevel@tonic-gate } 290*0Sstevel@tonic-gate 291*0Sstevel@tonic-gate /* 292*0Sstevel@tonic-gate * get set info from name 293*0Sstevel@tonic-gate */ 294*0Sstevel@tonic-gate md_set_record * 295*0Sstevel@tonic-gate getsetbyname(char *setname, md_error_t *ep) 296*0Sstevel@tonic-gate { 297*0Sstevel@tonic-gate md_set_record *sr = NULL; 298*0Sstevel@tonic-gate md_mnset_record *mnsr = NULL; 299*0Sstevel@tonic-gate char *p; 300*0Sstevel@tonic-gate size_t len; 301*0Sstevel@tonic-gate 302*0Sstevel@tonic-gate /* get set info from daemon */ 303*0Sstevel@tonic-gate if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1) 304*0Sstevel@tonic-gate return (NULL); 305*0Sstevel@tonic-gate if (sr != NULL) { 306*0Sstevel@tonic-gate /* 307*0Sstevel@tonic-gate * Returned record could be for a multi-node set or a 
308*0Sstevel@tonic-gate * non-multi-node set. 309*0Sstevel@tonic-gate */ 310*0Sstevel@tonic-gate if (MD_MNSET_REC(sr)) { 311*0Sstevel@tonic-gate /* 312*0Sstevel@tonic-gate * Record is for a multi-node set. Reissue call 313*0Sstevel@tonic-gate * to get mnset information. Need to free 314*0Sstevel@tonic-gate * record as if a non-multi-node set record since 315*0Sstevel@tonic-gate * that is what clnt_getset gave us. If in 316*0Sstevel@tonic-gate * the daemon, don't free since this is a pointer 317*0Sstevel@tonic-gate * into the setrecords array. 318*0Sstevel@tonic-gate */ 319*0Sstevel@tonic-gate if (! md_in_daemon) { 320*0Sstevel@tonic-gate sr->sr_flags &= ~MD_SR_MN; 321*0Sstevel@tonic-gate free_sr(sr); 322*0Sstevel@tonic-gate } 323*0Sstevel@tonic-gate if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr, 324*0Sstevel@tonic-gate ep) == -1) 325*0Sstevel@tonic-gate return (NULL); 326*0Sstevel@tonic-gate if (mnsr != NULL) 327*0Sstevel@tonic-gate return ((struct md_set_record *)mnsr); 328*0Sstevel@tonic-gate } else { 329*0Sstevel@tonic-gate return (sr); 330*0Sstevel@tonic-gate } 331*0Sstevel@tonic-gate } 332*0Sstevel@tonic-gate 333*0Sstevel@tonic-gate /* no such set */ 334*0Sstevel@tonic-gate len = strlen(setname) + 30; 335*0Sstevel@tonic-gate p = Malloc(len); 336*0Sstevel@tonic-gate (void) snprintf(p, len, "setname \"%s\"", setname); 337*0Sstevel@tonic-gate (void) mderror(ep, MDE_NO_SET, p); 338*0Sstevel@tonic-gate Free(p); 339*0Sstevel@tonic-gate return (NULL); 340*0Sstevel@tonic-gate } 341*0Sstevel@tonic-gate 342*0Sstevel@tonic-gate /* 343*0Sstevel@tonic-gate * get set info from number 344*0Sstevel@tonic-gate */ 345*0Sstevel@tonic-gate md_set_record * 346*0Sstevel@tonic-gate getsetbynum(set_t setno, md_error_t *ep) 347*0Sstevel@tonic-gate { 348*0Sstevel@tonic-gate md_set_record *sr; 349*0Sstevel@tonic-gate md_mnset_record *mnsr = NULL; 350*0Sstevel@tonic-gate char buf[100]; 351*0Sstevel@tonic-gate 352*0Sstevel@tonic-gate if (clnt_getset(mynode(), NULL, setno, &sr, 
ep) == -1) 353*0Sstevel@tonic-gate return (NULL); 354*0Sstevel@tonic-gate 355*0Sstevel@tonic-gate if (sr != NULL) { 356*0Sstevel@tonic-gate /* 357*0Sstevel@tonic-gate * Record is for a multi-node set. Reissue call 358*0Sstevel@tonic-gate * to get mnset information. Need to free 359*0Sstevel@tonic-gate * record as if a non-multi-node set record since 360*0Sstevel@tonic-gate * that is what clnt_getset gave us. If in 361*0Sstevel@tonic-gate * the daemon, don't free since this is a pointer 362*0Sstevel@tonic-gate * into the setrecords array. 363*0Sstevel@tonic-gate */ 364*0Sstevel@tonic-gate if (MD_MNSET_REC(sr)) { 365*0Sstevel@tonic-gate /* 366*0Sstevel@tonic-gate * Record is for a multi-node set. Reissue call 367*0Sstevel@tonic-gate * to get mnset information. 368*0Sstevel@tonic-gate */ 369*0Sstevel@tonic-gate if (! md_in_daemon) { 370*0Sstevel@tonic-gate sr->sr_flags &= ~MD_SR_MN; 371*0Sstevel@tonic-gate free_sr(sr); 372*0Sstevel@tonic-gate } 373*0Sstevel@tonic-gate if (clnt_mngetset(mynode(), NULL, setno, &mnsr, 374*0Sstevel@tonic-gate ep) == -1) 375*0Sstevel@tonic-gate return (NULL); 376*0Sstevel@tonic-gate if (mnsr != NULL) 377*0Sstevel@tonic-gate return ((struct md_set_record *)mnsr); 378*0Sstevel@tonic-gate } else { 379*0Sstevel@tonic-gate return (sr); 380*0Sstevel@tonic-gate } 381*0Sstevel@tonic-gate } 382*0Sstevel@tonic-gate 383*0Sstevel@tonic-gate (void) sprintf(buf, "setno %u", setno); 384*0Sstevel@tonic-gate (void) mderror(ep, MDE_NO_SET, buf); 385*0Sstevel@tonic-gate return (NULL); 386*0Sstevel@tonic-gate } 387*0Sstevel@tonic-gate 388*0Sstevel@tonic-gate int 389*0Sstevel@tonic-gate meta_check_drive_inuse( 390*0Sstevel@tonic-gate mdsetname_t *sp, 391*0Sstevel@tonic-gate mddrivename_t *dnp, 392*0Sstevel@tonic-gate int check_db, 393*0Sstevel@tonic-gate md_error_t *ep 394*0Sstevel@tonic-gate ) 395*0Sstevel@tonic-gate { 396*0Sstevel@tonic-gate mdnamelist_t *nlp = NULL; 397*0Sstevel@tonic-gate mdnamelist_t *p; 398*0Sstevel@tonic-gate int rval = 0; 
399*0Sstevel@tonic-gate 400*0Sstevel@tonic-gate /* get all underlying partitions */ 401*0Sstevel@tonic-gate if (meta_getalldevs(sp, &nlp, check_db, ep) != 0) 402*0Sstevel@tonic-gate return (-1); 403*0Sstevel@tonic-gate 404*0Sstevel@tonic-gate /* search for drive */ 405*0Sstevel@tonic-gate for (p = nlp; (p != NULL); p = p->next) { 406*0Sstevel@tonic-gate mdname_t *np = p->namep; 407*0Sstevel@tonic-gate 408*0Sstevel@tonic-gate if (strcmp(dnp->cname, np->drivenamep->cname) == 0) { 409*0Sstevel@tonic-gate rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno, 410*0Sstevel@tonic-gate NULL, dnp->cname, sp->setname)); 411*0Sstevel@tonic-gate break; 412*0Sstevel@tonic-gate } 413*0Sstevel@tonic-gate } 414*0Sstevel@tonic-gate 415*0Sstevel@tonic-gate /* cleanup, return success */ 416*0Sstevel@tonic-gate metafreenamelist(nlp); 417*0Sstevel@tonic-gate return (rval); 418*0Sstevel@tonic-gate } 419*0Sstevel@tonic-gate 420*0Sstevel@tonic-gate /* 421*0Sstevel@tonic-gate * simple check for ownership 422*0Sstevel@tonic-gate */ 423*0Sstevel@tonic-gate int 424*0Sstevel@tonic-gate meta_check_ownership(mdsetname_t *sp, md_error_t *ep) 425*0Sstevel@tonic-gate { 426*0Sstevel@tonic-gate int ownset; 427*0Sstevel@tonic-gate md_set_desc *sd; 428*0Sstevel@tonic-gate md_drive_desc *dd; 429*0Sstevel@tonic-gate md_replicalist_t *rlp = NULL; 430*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 431*0Sstevel@tonic-gate 432*0Sstevel@tonic-gate if (metaislocalset(sp)) 433*0Sstevel@tonic-gate return (0); 434*0Sstevel@tonic-gate 435*0Sstevel@tonic-gate ownset = own_set(sp, NULL, TRUE, ep); 436*0Sstevel@tonic-gate if (! mdisok(ep)) 437*0Sstevel@tonic-gate return (-1); 438*0Sstevel@tonic-gate 439*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 440*0Sstevel@tonic-gate return (-1); 441*0Sstevel@tonic-gate 442*0Sstevel@tonic-gate dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 443*0Sstevel@tonic-gate if (! 
mdisok(ep)) 444*0Sstevel@tonic-gate return (-1); 445*0Sstevel@tonic-gate 446*0Sstevel@tonic-gate /* If we have no drive descriptors, check for no ownership */ 447*0Sstevel@tonic-gate if (dd == NULL) { 448*0Sstevel@tonic-gate if (ownset == MD_SETOWNER_NONE) 449*0Sstevel@tonic-gate return (0); 450*0Sstevel@tonic-gate 451*0Sstevel@tonic-gate /* If ownership somehow has come to exist, we must clean up */ 452*0Sstevel@tonic-gate 453*0Sstevel@tonic-gate if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, 454*0Sstevel@tonic-gate &xep) < 0) 455*0Sstevel@tonic-gate mdclrerror(&xep); 456*0Sstevel@tonic-gate 457*0Sstevel@tonic-gate if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL) 458*0Sstevel@tonic-gate if (! mdisok(&xep)) 459*0Sstevel@tonic-gate mdclrerror(&xep); 460*0Sstevel@tonic-gate 461*0Sstevel@tonic-gate if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { 462*0Sstevel@tonic-gate if (rel_own_bydd(sp, dd, TRUE, &xep)) 463*0Sstevel@tonic-gate mdclrerror(&xep); 464*0Sstevel@tonic-gate } 465*0Sstevel@tonic-gate 466*0Sstevel@tonic-gate if (halt_set(sp, &xep)) 467*0Sstevel@tonic-gate mdclrerror(&xep); 468*0Sstevel@tonic-gate 469*0Sstevel@tonic-gate metafreereplicalist(rlp); 470*0Sstevel@tonic-gate 471*0Sstevel@tonic-gate metafreedrivedesc(&dd); 472*0Sstevel@tonic-gate 473*0Sstevel@tonic-gate return (0); 474*0Sstevel@tonic-gate } 475*0Sstevel@tonic-gate 476*0Sstevel@tonic-gate metafreedrivedesc(&sd->sd_drvs); 477*0Sstevel@tonic-gate 478*0Sstevel@tonic-gate if (ownset == MD_SETOWNER_YES) 479*0Sstevel@tonic-gate return (0); 480*0Sstevel@tonic-gate 481*0Sstevel@tonic-gate return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL, 482*0Sstevel@tonic-gate sp->setname)); 483*0Sstevel@tonic-gate } 484*0Sstevel@tonic-gate 485*0Sstevel@tonic-gate /* 486*0Sstevel@tonic-gate * simple check for ownership 487*0Sstevel@tonic-gate */ 488*0Sstevel@tonic-gate int 489*0Sstevel@tonic-gate meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep) 490*0Sstevel@tonic-gate { 
491*0Sstevel@tonic-gate md_set_desc *sd; 492*0Sstevel@tonic-gate md_drive_desc *dd; 493*0Sstevel@tonic-gate int bool; 494*0Sstevel@tonic-gate 495*0Sstevel@tonic-gate if (metaislocalset(sp)) 496*0Sstevel@tonic-gate return (0); 497*0Sstevel@tonic-gate 498*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 499*0Sstevel@tonic-gate return (-1); 500*0Sstevel@tonic-gate 501*0Sstevel@tonic-gate if (getnodeside(hostname, sd) == MD_SIDEWILD) 502*0Sstevel@tonic-gate return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 503*0Sstevel@tonic-gate hostname, NULL, sp->setname)); 504*0Sstevel@tonic-gate 505*0Sstevel@tonic-gate dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 506*0Sstevel@tonic-gate if (! mdisok(ep)) 507*0Sstevel@tonic-gate return (-1); 508*0Sstevel@tonic-gate 509*0Sstevel@tonic-gate if (clnt_ownset(hostname, sp, &bool, ep) == -1) 510*0Sstevel@tonic-gate return (-1); 511*0Sstevel@tonic-gate 512*0Sstevel@tonic-gate if (dd == NULL) 513*0Sstevel@tonic-gate return (0); 514*0Sstevel@tonic-gate 515*0Sstevel@tonic-gate metafreedrivedesc(&sd->sd_drvs); 516*0Sstevel@tonic-gate 517*0Sstevel@tonic-gate if (bool == TRUE) 518*0Sstevel@tonic-gate return (0); 519*0Sstevel@tonic-gate 520*0Sstevel@tonic-gate return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL, 521*0Sstevel@tonic-gate sp->setname)); 522*0Sstevel@tonic-gate } 523*0Sstevel@tonic-gate 524*0Sstevel@tonic-gate /* 525*0Sstevel@tonic-gate * Function that determines if a node is in the multinode diskset 526*0Sstevel@tonic-gate * membership list. Calling node passes in node to be checked and 527*0Sstevel@tonic-gate * the nodelist as returned from meta_read_nodelist. This routine 528*0Sstevel@tonic-gate * anticipates being called many times using the same diskset membership 529*0Sstevel@tonic-gate * list which is why the alloc and free of the diskset membership list 530*0Sstevel@tonic-gate * is left to the calling routine. 
531*0Sstevel@tonic-gate * Returns: 532*0Sstevel@tonic-gate * 1 - if a member 533*0Sstevel@tonic-gate * 0 - not a member 534*0Sstevel@tonic-gate */ 535*0Sstevel@tonic-gate int 536*0Sstevel@tonic-gate meta_is_member( 537*0Sstevel@tonic-gate char *node_name, 538*0Sstevel@tonic-gate md_mn_nodeid_t node_id, 539*0Sstevel@tonic-gate mndiskset_membershiplist_t *nl 540*0Sstevel@tonic-gate ) 541*0Sstevel@tonic-gate { 542*0Sstevel@tonic-gate mndiskset_membershiplist_t *nl2; 543*0Sstevel@tonic-gate int flag_check_name; 544*0Sstevel@tonic-gate 545*0Sstevel@tonic-gate if (node_id != 0) 546*0Sstevel@tonic-gate flag_check_name = 0; 547*0Sstevel@tonic-gate else if (node_name != NULL) 548*0Sstevel@tonic-gate flag_check_name = 1; 549*0Sstevel@tonic-gate else 550*0Sstevel@tonic-gate return (0); 551*0Sstevel@tonic-gate 552*0Sstevel@tonic-gate nl2 = nl; 553*0Sstevel@tonic-gate while (nl2) { 554*0Sstevel@tonic-gate if (flag_check_name) { 555*0Sstevel@tonic-gate /* Compare given name against name in member list */ 556*0Sstevel@tonic-gate if (strcmp(nl2->msl_node_name, node_name) == 0) 557*0Sstevel@tonic-gate break; 558*0Sstevel@tonic-gate } else { 559*0Sstevel@tonic-gate /* Compare given nodeid against nodeid in member list */ 560*0Sstevel@tonic-gate if (nl2->msl_node_id == node_id) 561*0Sstevel@tonic-gate break; 562*0Sstevel@tonic-gate } 563*0Sstevel@tonic-gate nl2 = nl2->next; 564*0Sstevel@tonic-gate } 565*0Sstevel@tonic-gate /* No match found in member list */ 566*0Sstevel@tonic-gate if (nl2 == NULL) { 567*0Sstevel@tonic-gate return (0); 568*0Sstevel@tonic-gate } 569*0Sstevel@tonic-gate /* Return 1 if node is in member list */ 570*0Sstevel@tonic-gate return (1); 571*0Sstevel@tonic-gate } 572*0Sstevel@tonic-gate 573*0Sstevel@tonic-gate /* 574*0Sstevel@tonic-gate * meta_getnext_devinfo should go to the host that 575*0Sstevel@tonic-gate * has the device, to return the device name, driver name, minor num. 
576*0Sstevel@tonic-gate * We can take the big cheat for now, since it is a requirement 577*0Sstevel@tonic-gate * that the device names and device numbers are the same, and 578*0Sstevel@tonic-gate * just get the info locally. 579*0Sstevel@tonic-gate * 580*0Sstevel@tonic-gate * This routine is very similar to meta_getnextside_devinfo except 581*0Sstevel@tonic-gate * that the specific side to be used is being passed in. 582*0Sstevel@tonic-gate * 583*0Sstevel@tonic-gate * Exit status: 584*0Sstevel@tonic-gate * 0 - No more side info to return 585*0Sstevel@tonic-gate * 1 - More side info's to return 586*0Sstevel@tonic-gate * -1 - An error has been detected 587*0Sstevel@tonic-gate */ 588*0Sstevel@tonic-gate /*ARGSUSED*/ 589*0Sstevel@tonic-gate int 590*0Sstevel@tonic-gate meta_getside_devinfo( 591*0Sstevel@tonic-gate mdsetname_t *sp, /* for this set */ 592*0Sstevel@tonic-gate char *bname, /* local block name (myside) */ 593*0Sstevel@tonic-gate side_t sideno, /* sideno */ 594*0Sstevel@tonic-gate char **ret_bname, /* block device name of returned side */ 595*0Sstevel@tonic-gate char **ret_dname, /* driver name of returned side */ 596*0Sstevel@tonic-gate minor_t *ret_mnum, /* minor number of returned side */ 597*0Sstevel@tonic-gate md_error_t *ep 598*0Sstevel@tonic-gate ) 599*0Sstevel@tonic-gate { 600*0Sstevel@tonic-gate mdname_t *np; 601*0Sstevel@tonic-gate 602*0Sstevel@tonic-gate if (ret_bname != NULL) 603*0Sstevel@tonic-gate *ret_bname = NULL; 604*0Sstevel@tonic-gate if (ret_dname != NULL) 605*0Sstevel@tonic-gate *ret_dname = NULL; 606*0Sstevel@tonic-gate if (ret_mnum != NULL) 607*0Sstevel@tonic-gate *ret_mnum = NODEV32; 608*0Sstevel@tonic-gate 609*0Sstevel@tonic-gate 610*0Sstevel@tonic-gate if ((np = metaname(&sp, bname, ep)) == NULL) 611*0Sstevel@tonic-gate return (-1); 612*0Sstevel@tonic-gate 613*0Sstevel@tonic-gate /* 614*0Sstevel@tonic-gate * NOTE (future) - There will be more work here once devids are integrated 615*0Sstevel@tonic-gate * into disksets. 
Then the side should be used to find the correct 616*0Sstevel@tonic-gate * host and the b/d names should be gotten from that host. 617*0Sstevel@tonic-gate */ 618*0Sstevel@tonic-gate 619*0Sstevel@tonic-gate /* 620*0Sstevel@tonic-gate * Return the side info. 621*0Sstevel@tonic-gate */ 622*0Sstevel@tonic-gate if (ret_bname != NULL) 623*0Sstevel@tonic-gate *ret_bname = Strdup(np->bname); 624*0Sstevel@tonic-gate 625*0Sstevel@tonic-gate if (ret_dname != NULL) { 626*0Sstevel@tonic-gate mdcinfo_t *cinfo; 627*0Sstevel@tonic-gate 628*0Sstevel@tonic-gate if ((cinfo = metagetcinfo(np, ep)) == NULL) 629*0Sstevel@tonic-gate return (-1); 630*0Sstevel@tonic-gate 631*0Sstevel@tonic-gate *ret_dname = Strdup(cinfo->dname); 632*0Sstevel@tonic-gate } 633*0Sstevel@tonic-gate 634*0Sstevel@tonic-gate if (ret_mnum != NULL) 635*0Sstevel@tonic-gate *ret_mnum = meta_getminor(np->dev); 636*0Sstevel@tonic-gate 637*0Sstevel@tonic-gate return (1); 638*0Sstevel@tonic-gate } 639*0Sstevel@tonic-gate 640*0Sstevel@tonic-gate /* 641*0Sstevel@tonic-gate * Get the information on the device from the remote node using the devid 642*0Sstevel@tonic-gate * of the disk. 
643*0Sstevel@tonic-gate * 644*0Sstevel@tonic-gate * Exit status: 645*0Sstevel@tonic-gate * 0 - No more side info to return 646*0Sstevel@tonic-gate * 1 - More side info's to return 647*0Sstevel@tonic-gate * -1 - An error has been detected 648*0Sstevel@tonic-gate */ 649*0Sstevel@tonic-gate int 650*0Sstevel@tonic-gate meta_getnextside_devinfo( 651*0Sstevel@tonic-gate mdsetname_t *sp, /* for this set */ 652*0Sstevel@tonic-gate char *bname, /* local block name (myside) */ 653*0Sstevel@tonic-gate side_t *sideno, /* previous sideno & returned sideno */ 654*0Sstevel@tonic-gate char **ret_bname, /* block device name of returned side */ 655*0Sstevel@tonic-gate char **ret_dname, /* driver name of returned side */ 656*0Sstevel@tonic-gate minor_t *ret_mnum, /* minor number of returned side */ 657*0Sstevel@tonic-gate md_error_t *ep 658*0Sstevel@tonic-gate ) 659*0Sstevel@tonic-gate { 660*0Sstevel@tonic-gate md_set_desc *sd; 661*0Sstevel@tonic-gate int i; 662*0Sstevel@tonic-gate mdname_t *np; 663*0Sstevel@tonic-gate mddrivename_t *dnp; 664*0Sstevel@tonic-gate char *devidstr = NULL; 665*0Sstevel@tonic-gate int devidstrlen; 666*0Sstevel@tonic-gate md_dev64_t retdev = NODEV64; 667*0Sstevel@tonic-gate char *ret_devname = NULL; 668*0Sstevel@tonic-gate char *ret_blkdevname = NULL; 669*0Sstevel@tonic-gate char *ret_driver = NULL; 670*0Sstevel@tonic-gate char *nodename; 671*0Sstevel@tonic-gate int fd; 672*0Sstevel@tonic-gate int ret = -1; 673*0Sstevel@tonic-gate char *minor_name = NULL; 674*0Sstevel@tonic-gate md_mnnode_desc *nd; 675*0Sstevel@tonic-gate 676*0Sstevel@tonic-gate 677*0Sstevel@tonic-gate if (ret_bname != NULL) 678*0Sstevel@tonic-gate *ret_bname = NULL; 679*0Sstevel@tonic-gate if (ret_dname != NULL) 680*0Sstevel@tonic-gate *ret_dname = NULL; 681*0Sstevel@tonic-gate if (ret_mnum != NULL) 682*0Sstevel@tonic-gate *ret_mnum = NODEV32; 683*0Sstevel@tonic-gate 684*0Sstevel@tonic-gate if (metaislocalset(sp)) { 685*0Sstevel@tonic-gate /* no more sides - we are done */ 
686*0Sstevel@tonic-gate if (*sideno != MD_SIDEWILD) 687*0Sstevel@tonic-gate return (0); 688*0Sstevel@tonic-gate 689*0Sstevel@tonic-gate /* First time through - set up return sideno */ 690*0Sstevel@tonic-gate *sideno = 0; 691*0Sstevel@tonic-gate } else { 692*0Sstevel@tonic-gate 693*0Sstevel@tonic-gate /* 694*0Sstevel@tonic-gate * Find the next sideno, starting after the one given. 695*0Sstevel@tonic-gate */ 696*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 697*0Sstevel@tonic-gate return (-1); 698*0Sstevel@tonic-gate 699*0Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 700*0Sstevel@tonic-gate nd = sd->sd_nodelist; 701*0Sstevel@tonic-gate if ((*sideno == MD_SIDEWILD) && 702*0Sstevel@tonic-gate (nd != (struct md_mnnode_desc *)NULL)) { 703*0Sstevel@tonic-gate *sideno = nd->nd_nodeid; 704*0Sstevel@tonic-gate } else { 705*0Sstevel@tonic-gate while (nd) { 706*0Sstevel@tonic-gate /* 707*0Sstevel@tonic-gate * Found given sideno, now find 708*0Sstevel@tonic-gate * next sideno, if there is one. 
709*0Sstevel@tonic-gate */ 710*0Sstevel@tonic-gate if ((*sideno == nd->nd_nodeid) && 711*0Sstevel@tonic-gate (nd->nd_next != 712*0Sstevel@tonic-gate (struct md_mnnode_desc *)NULL)) { 713*0Sstevel@tonic-gate *sideno = 714*0Sstevel@tonic-gate nd->nd_next->nd_nodeid; 715*0Sstevel@tonic-gate break; 716*0Sstevel@tonic-gate } 717*0Sstevel@tonic-gate nd = nd->nd_next; 718*0Sstevel@tonic-gate } 719*0Sstevel@tonic-gate if (nd == NULL) { 720*0Sstevel@tonic-gate return (0); 721*0Sstevel@tonic-gate } 722*0Sstevel@tonic-gate } 723*0Sstevel@tonic-gate if (*sideno == MD_SIDEWILD) 724*0Sstevel@tonic-gate return (0); 725*0Sstevel@tonic-gate } else { 726*0Sstevel@tonic-gate for (i = (*sideno)+1; i < MD_MAXSIDES; i++) 727*0Sstevel@tonic-gate /* Find next full slot */ 728*0Sstevel@tonic-gate if (sd->sd_nodes[i][0] != '\0') 729*0Sstevel@tonic-gate break; 730*0Sstevel@tonic-gate 731*0Sstevel@tonic-gate /* No more sides - we are done */ 732*0Sstevel@tonic-gate if (i == MD_MAXSIDES) 733*0Sstevel@tonic-gate return (0); 734*0Sstevel@tonic-gate 735*0Sstevel@tonic-gate /* Set up the return sideno */ 736*0Sstevel@tonic-gate *sideno = i; 737*0Sstevel@tonic-gate nodename = (char *)sd->sd_nodes[i]; 738*0Sstevel@tonic-gate } 739*0Sstevel@tonic-gate } 740*0Sstevel@tonic-gate 741*0Sstevel@tonic-gate /* 742*0Sstevel@tonic-gate * Need to pass the node the devid of the disk and get it to 743*0Sstevel@tonic-gate * send back the details of the disk from that side. 744*0Sstevel@tonic-gate */ 745*0Sstevel@tonic-gate if ((np = metaname(&sp, bname, ep)) == NULL) 746*0Sstevel@tonic-gate return (-1); 747*0Sstevel@tonic-gate 748*0Sstevel@tonic-gate dnp = np->drivenamep; 749*0Sstevel@tonic-gate 750*0Sstevel@tonic-gate /* 751*0Sstevel@tonic-gate * By default, set up the parameters so that they are copied out. 
752*0Sstevel@tonic-gate */ 753*0Sstevel@tonic-gate if (ret_bname != NULL) 754*0Sstevel@tonic-gate *ret_bname = Strdup(np->bname); 755*0Sstevel@tonic-gate 756*0Sstevel@tonic-gate if (ret_dname != NULL) { 757*0Sstevel@tonic-gate mdcinfo_t *cinfo; 758*0Sstevel@tonic-gate 759*0Sstevel@tonic-gate if ((cinfo = metagetcinfo(np, ep)) == NULL) 760*0Sstevel@tonic-gate return (-1); 761*0Sstevel@tonic-gate 762*0Sstevel@tonic-gate *ret_dname = Strdup(cinfo->dname); 763*0Sstevel@tonic-gate } 764*0Sstevel@tonic-gate 765*0Sstevel@tonic-gate if (ret_mnum != NULL) 766*0Sstevel@tonic-gate *ret_mnum = meta_getminor(np->dev); 767*0Sstevel@tonic-gate 768*0Sstevel@tonic-gate /* 769*0Sstevel@tonic-gate * Try some optimization. If this is the local set or the device 770*0Sstevel@tonic-gate * is a metadevice then just copy the information. If the device 771*0Sstevel@tonic-gate * does not have a devid (due to not having a minor name) then 772*0Sstevel@tonic-gate * fall back to the pre-devid behaviour of copying the information 773*0Sstevel@tonic-gate * on the device: this is okay because the sanity checks before this 774*0Sstevel@tonic-gate * call would have found any issues with the device. If it's a 775*0Sstevel@tonic-gate * multi-node diskset also just return ie. copy. 776*0Sstevel@tonic-gate */ 777*0Sstevel@tonic-gate if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) || 778*0Sstevel@tonic-gate (MD_MNSET_DESC(sd))) 779*0Sstevel@tonic-gate return (1); 780*0Sstevel@tonic-gate 781*0Sstevel@tonic-gate if (np->minor_name == (char *)NULL) { 782*0Sstevel@tonic-gate /* 783*0Sstevel@tonic-gate * Have to get the minor name then. The slice should exist 784*0Sstevel@tonic-gate * on the disk because it will have already been repartitioned 785*0Sstevel@tonic-gate * up prior to getting to this point. 
786*0Sstevel@tonic-gate */ 787*0Sstevel@tonic-gate if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) { 788*0Sstevel@tonic-gate (void) mdsyserror(ep, errno, np->bname); 789*0Sstevel@tonic-gate return (-1); 790*0Sstevel@tonic-gate } 791*0Sstevel@tonic-gate (void) devid_get_minor_name(fd, &minor_name); 792*0Sstevel@tonic-gate np->minor_name = Strdup(minor_name); 793*0Sstevel@tonic-gate devid_str_free(minor_name); 794*0Sstevel@tonic-gate (void) close(fd); 795*0Sstevel@tonic-gate } 796*0Sstevel@tonic-gate 797*0Sstevel@tonic-gate /* allocate extra space for "/" and NULL hence +2 */ 798*0Sstevel@tonic-gate devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2; 799*0Sstevel@tonic-gate devidstr = (char *)Malloc(devidstrlen); 800*0Sstevel@tonic-gate 801*0Sstevel@tonic-gate /* 802*0Sstevel@tonic-gate * As a minor name is supplied then the ret_devname will be 803*0Sstevel@tonic-gate * appropriate to that minor_name and in this case it will be 804*0Sstevel@tonic-gate * a block device ie /dev/dsk. 
805*0Sstevel@tonic-gate */ 806*0Sstevel@tonic-gate (void) snprintf(devidstr, devidstrlen, 807*0Sstevel@tonic-gate "%s/%s", dnp->devid, np->minor_name); 808*0Sstevel@tonic-gate 809*0Sstevel@tonic-gate ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev, 810*0Sstevel@tonic-gate np->bname, &ret_devname, &ret_driver, ep); 811*0Sstevel@tonic-gate 812*0Sstevel@tonic-gate Free(devidstr); 813*0Sstevel@tonic-gate 814*0Sstevel@tonic-gate /* 815*0Sstevel@tonic-gate * If the other side is not running device id in disksets, 816*0Sstevel@tonic-gate * 'ret' is set to ENOTSUP in which case we fallback to 817*0Sstevel@tonic-gate * the existing behaviour 818*0Sstevel@tonic-gate */ 819*0Sstevel@tonic-gate if (ret == ENOTSUP) 820*0Sstevel@tonic-gate return (1); 821*0Sstevel@tonic-gate else if (ret == -1) 822*0Sstevel@tonic-gate return (-1); 823*0Sstevel@tonic-gate 824*0Sstevel@tonic-gate /* 825*0Sstevel@tonic-gate * ret_devname comes from the rpc call and is a 826*0Sstevel@tonic-gate * raw device name. We need to make this into a 827*0Sstevel@tonic-gate * block device via blkname for further processing. 828*0Sstevel@tonic-gate * Unfortunately, when our device id isn't found in 829*0Sstevel@tonic-gate * the system, the rpc call will return a " " in 830*0Sstevel@tonic-gate * ret_devname in which case we need to fill that in 831*0Sstevel@tonic-gate * as ret_blkname because blkname of " " returns NULL. 
832*0Sstevel@tonic-gate */ 833*0Sstevel@tonic-gate if (ret_bname != NULL && ret_devname != NULL) { 834*0Sstevel@tonic-gate ret_blkdevname = blkname(ret_devname); 835*0Sstevel@tonic-gate if (ret_blkdevname == NULL) 836*0Sstevel@tonic-gate *ret_bname = Strdup(ret_devname); 837*0Sstevel@tonic-gate else 838*0Sstevel@tonic-gate *ret_bname = Strdup(ret_blkdevname); 839*0Sstevel@tonic-gate } 840*0Sstevel@tonic-gate 841*0Sstevel@tonic-gate if (ret_dname != NULL && ret_driver != NULL) 842*0Sstevel@tonic-gate *ret_dname = Strdup(ret_driver); 843*0Sstevel@tonic-gate 844*0Sstevel@tonic-gate if (ret_mnum != NULL) 845*0Sstevel@tonic-gate *ret_mnum = meta_getminor(retdev); 846*0Sstevel@tonic-gate 847*0Sstevel@tonic-gate return (1); 848*0Sstevel@tonic-gate } 849*0Sstevel@tonic-gate 850*0Sstevel@tonic-gate int 851*0Sstevel@tonic-gate meta_is_drive_in_anyset( 852*0Sstevel@tonic-gate mddrivename_t *dnp, 853*0Sstevel@tonic-gate mdsetname_t **spp, 854*0Sstevel@tonic-gate int bypass_daemon, 855*0Sstevel@tonic-gate md_error_t *ep 856*0Sstevel@tonic-gate ) 857*0Sstevel@tonic-gate { 858*0Sstevel@tonic-gate set_t setno; 859*0Sstevel@tonic-gate mdsetname_t *this_sp; 860*0Sstevel@tonic-gate int is_it; 861*0Sstevel@tonic-gate set_t max_sets; 862*0Sstevel@tonic-gate 863*0Sstevel@tonic-gate if ((max_sets = get_max_sets(ep)) == 0) 864*0Sstevel@tonic-gate return (-1); 865*0Sstevel@tonic-gate 866*0Sstevel@tonic-gate assert(spp != NULL); 867*0Sstevel@tonic-gate *spp = NULL; 868*0Sstevel@tonic-gate 869*0Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) { 870*0Sstevel@tonic-gate if (!bypass_daemon) { 871*0Sstevel@tonic-gate if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 872*0Sstevel@tonic-gate if (mdismddberror(ep, MDE_DB_NODB)) { 873*0Sstevel@tonic-gate mdclrerror(ep); 874*0Sstevel@tonic-gate return (0); 875*0Sstevel@tonic-gate } 876*0Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 877*0Sstevel@tonic-gate mdclrerror(ep); 878*0Sstevel@tonic-gate continue; 
879*0Sstevel@tonic-gate } 880*0Sstevel@tonic-gate return (-1); 881*0Sstevel@tonic-gate } 882*0Sstevel@tonic-gate } else 883*0Sstevel@tonic-gate this_sp = metafakesetname(setno, NULL); 884*0Sstevel@tonic-gate 885*0Sstevel@tonic-gate if ((is_it = meta_is_drive_in_thisset(this_sp, dnp, 886*0Sstevel@tonic-gate bypass_daemon, ep)) == -1) { 887*0Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 888*0Sstevel@tonic-gate mdclrerror(ep); 889*0Sstevel@tonic-gate continue; 890*0Sstevel@tonic-gate } 891*0Sstevel@tonic-gate return (-1); 892*0Sstevel@tonic-gate } 893*0Sstevel@tonic-gate if (is_it) { 894*0Sstevel@tonic-gate *spp = this_sp; 895*0Sstevel@tonic-gate return (0); 896*0Sstevel@tonic-gate } 897*0Sstevel@tonic-gate } 898*0Sstevel@tonic-gate return (0); 899*0Sstevel@tonic-gate } 900*0Sstevel@tonic-gate 901*0Sstevel@tonic-gate int 902*0Sstevel@tonic-gate meta_is_drive_in_thisset( 903*0Sstevel@tonic-gate mdsetname_t *sp, 904*0Sstevel@tonic-gate mddrivename_t *dnp, 905*0Sstevel@tonic-gate int bypass_daemon, 906*0Sstevel@tonic-gate md_error_t *ep 907*0Sstevel@tonic-gate ) 908*0Sstevel@tonic-gate { 909*0Sstevel@tonic-gate md_drive_desc *dd, *p; 910*0Sstevel@tonic-gate 911*0Sstevel@tonic-gate if (bypass_daemon) 912*0Sstevel@tonic-gate dd = dr2drivedesc(sp, MD_SIDEWILD, 913*0Sstevel@tonic-gate (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep); 914*0Sstevel@tonic-gate else 915*0Sstevel@tonic-gate dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 916*0Sstevel@tonic-gate 917*0Sstevel@tonic-gate if (dd == NULL) { 918*0Sstevel@tonic-gate if (! 
mdisok(ep)) 919*0Sstevel@tonic-gate return (-1); 920*0Sstevel@tonic-gate return (0); 921*0Sstevel@tonic-gate } 922*0Sstevel@tonic-gate 923*0Sstevel@tonic-gate 924*0Sstevel@tonic-gate for (p = dd; p != NULL; p = p->dd_next) 925*0Sstevel@tonic-gate if (strcmp(p->dd_dnp->cname, dnp->cname) == 0) 926*0Sstevel@tonic-gate return (1); 927*0Sstevel@tonic-gate return (0); 928*0Sstevel@tonic-gate } 929*0Sstevel@tonic-gate 930*0Sstevel@tonic-gate int 931*0Sstevel@tonic-gate meta_set_balance( 932*0Sstevel@tonic-gate mdsetname_t *sp, 933*0Sstevel@tonic-gate md_error_t *ep 934*0Sstevel@tonic-gate ) 935*0Sstevel@tonic-gate { 936*0Sstevel@tonic-gate md_set_desc *sd; 937*0Sstevel@tonic-gate md_drive_desc *dd, *curdd; 938*0Sstevel@tonic-gate daddr_t dbsize; 939*0Sstevel@tonic-gate daddr_t nblks; 940*0Sstevel@tonic-gate int i; 941*0Sstevel@tonic-gate int rval = 0; 942*0Sstevel@tonic-gate sigset_t oldsigs; 943*0Sstevel@tonic-gate md_setkey_t *cl_sk; 944*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 945*0Sstevel@tonic-gate md_mnnode_desc *nd; 946*0Sstevel@tonic-gate int suspend1_flag = 0; 947*0Sstevel@tonic-gate 948*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 949*0Sstevel@tonic-gate return (-1); 950*0Sstevel@tonic-gate 951*0Sstevel@tonic-gate dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 952*0Sstevel@tonic-gate 953*0Sstevel@tonic-gate /* Make sure we own the set */ 954*0Sstevel@tonic-gate if (meta_check_ownership(sp, ep) != 0) 955*0Sstevel@tonic-gate return (-1); 956*0Sstevel@tonic-gate 957*0Sstevel@tonic-gate /* END CHECK CODE */ 958*0Sstevel@tonic-gate 959*0Sstevel@tonic-gate /* 960*0Sstevel@tonic-gate * Get drive descriptors for the drives that are currently in the set. 961*0Sstevel@tonic-gate */ 962*0Sstevel@tonic-gate curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep); 963*0Sstevel@tonic-gate 964*0Sstevel@tonic-gate if (! 
mdisok(ep)) 965*0Sstevel@tonic-gate return (-1); 966*0Sstevel@tonic-gate 967*0Sstevel@tonic-gate /* Find the minimum replica size in use, or use the default */ 968*0Sstevel@tonic-gate if ((nblks = meta_db_minreplica(sp, ep)) < 0) 969*0Sstevel@tonic-gate mdclrerror(ep); 970*0Sstevel@tonic-gate else 971*0Sstevel@tonic-gate dbsize = nblks; /* adjust replica size */ 972*0Sstevel@tonic-gate 973*0Sstevel@tonic-gate /* Make sure we are blocking all signals */ 974*0Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, &xep) < 0) 975*0Sstevel@tonic-gate mdclrerror(&xep); 976*0Sstevel@tonic-gate 977*0Sstevel@tonic-gate /* 978*0Sstevel@tonic-gate * Lock the set on current set members. 979*0Sstevel@tonic-gate * For MN diskset lock_set and SUSPEND are used to protect against 980*0Sstevel@tonic-gate * other meta* commands running on the other nodes. 
998*0Sstevel@tonic-gate */ 999*0Sstevel@tonic-gate nd = sd->sd_nodelist; 1000*0Sstevel@tonic-gate while (nd) { 1001*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1002*0Sstevel@tonic-gate nd = nd->nd_next; 1003*0Sstevel@tonic-gate continue; 1004*0Sstevel@tonic-gate } 1005*0Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, 1006*0Sstevel@tonic-gate COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, 1007*0Sstevel@tonic-gate MD_MSCF_NO_FLAGS, ep)) { 1008*0Sstevel@tonic-gate rval = -1; 1009*0Sstevel@tonic-gate goto out; 1010*0Sstevel@tonic-gate } 1011*0Sstevel@tonic-gate suspend1_flag = 1; 1012*0Sstevel@tonic-gate nd = nd->nd_next; 1013*0Sstevel@tonic-gate } 1014*0Sstevel@tonic-gate } else { 1015*0Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 1016*0Sstevel@tonic-gate /* Skip empty slots */ 1017*0Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') continue; 1018*0Sstevel@tonic-gate 1019*0Sstevel@tonic-gate if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1020*0Sstevel@tonic-gate rval = -1; 1021*0Sstevel@tonic-gate goto out; 1022*0Sstevel@tonic-gate } 1023*0Sstevel@tonic-gate } 1024*0Sstevel@tonic-gate } 1025*0Sstevel@tonic-gate 1026*0Sstevel@tonic-gate /* We are not adding or deleting any drives, just balancing */ 1027*0Sstevel@tonic-gate dd = NULL; 1028*0Sstevel@tonic-gate 1029*0Sstevel@tonic-gate /* 1030*0Sstevel@tonic-gate * Balance the DB's according to the list of existing drives and the 1031*0Sstevel@tonic-gate * list of added drives. 1032*0Sstevel@tonic-gate */ 1033*0Sstevel@tonic-gate if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1) 1034*0Sstevel@tonic-gate goto out; 1035*0Sstevel@tonic-gate 1036*0Sstevel@tonic-gate out: 1037*0Sstevel@tonic-gate /* 1038*0Sstevel@tonic-gate * Unlock diskset by resuming class 1 messages across the diskset. 1039*0Sstevel@tonic-gate * Just resume all classes so that resume is the same whether 1040*0Sstevel@tonic-gate * just one class was locked or all classes were locked. 
1041*0Sstevel@tonic-gate */ 1042*0Sstevel@tonic-gate if (suspend1_flag) { 1043*0Sstevel@tonic-gate nd = sd->sd_nodelist; 1044*0Sstevel@tonic-gate while (nd) { 1045*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1046*0Sstevel@tonic-gate nd = nd->nd_next; 1047*0Sstevel@tonic-gate continue; 1048*0Sstevel@tonic-gate } 1049*0Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 1050*0Sstevel@tonic-gate sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 1051*0Sstevel@tonic-gate /* 1052*0Sstevel@tonic-gate * We are here because we failed to resume 1053*0Sstevel@tonic-gate * rpc.mdcommd. However we potentially have 1054*0Sstevel@tonic-gate * an error from the previous call 1055*0Sstevel@tonic-gate * (meta_db_balance). If the previous call 1056*0Sstevel@tonic-gate * did fail, we capture that error and 1057*0Sstevel@tonic-gate * generate a perror with the string, 1058*0Sstevel@tonic-gate * "Unable to resume...". 1059*0Sstevel@tonic-gate * Setting rval to -1 ensures that in the 1060*0Sstevel@tonic-gate * next iteration of the loop, ep is not 1061*0Sstevel@tonic-gate * clobbered. 
1062*0Sstevel@tonic-gate */ 1063*0Sstevel@tonic-gate if (rval == 0) 1064*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 1065*0Sstevel@tonic-gate else 1066*0Sstevel@tonic-gate mdclrerror(&xep); 1067*0Sstevel@tonic-gate rval = -1; 1068*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 1069*0Sstevel@tonic-gate "Unable to resume rpc.mdcommd.")); 1070*0Sstevel@tonic-gate } 1071*0Sstevel@tonic-gate nd = nd->nd_next; 1072*0Sstevel@tonic-gate } 1073*0Sstevel@tonic-gate } 1074*0Sstevel@tonic-gate 1075*0Sstevel@tonic-gate /* Unlock the set */ 1076*0Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 1077*0Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 1078*0Sstevel@tonic-gate nd = sd->sd_nodelist; 1079*0Sstevel@tonic-gate while (nd) { 1080*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1081*0Sstevel@tonic-gate nd = nd->nd_next; 1082*0Sstevel@tonic-gate continue; 1083*0Sstevel@tonic-gate } 1084*0Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1085*0Sstevel@tonic-gate if (rval == 0) 1086*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 1087*0Sstevel@tonic-gate else 1088*0Sstevel@tonic-gate mdclrerror(&xep); 1089*0Sstevel@tonic-gate rval = -1; 1090*0Sstevel@tonic-gate } 1091*0Sstevel@tonic-gate nd = nd->nd_next; 1092*0Sstevel@tonic-gate } 1093*0Sstevel@tonic-gate } else { 1094*0Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 1095*0Sstevel@tonic-gate /* Skip empty slots */ 1096*0Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 1097*0Sstevel@tonic-gate continue; 1098*0Sstevel@tonic-gate 1099*0Sstevel@tonic-gate if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1100*0Sstevel@tonic-gate if (rval == 0) 1101*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 1102*0Sstevel@tonic-gate rval = -1; 1103*0Sstevel@tonic-gate } 1104*0Sstevel@tonic-gate } 1105*0Sstevel@tonic-gate } 1106*0Sstevel@tonic-gate 1107*0Sstevel@tonic-gate /* release signals back to what they were on entry */ 1108*0Sstevel@tonic-gate if 
(procsigs(FALSE, &oldsigs, &xep) < 0) 1109*0Sstevel@tonic-gate mdclrerror(&xep); 1110*0Sstevel@tonic-gate 1111*0Sstevel@tonic-gate cl_set_setkey(NULL); 1112*0Sstevel@tonic-gate 1113*0Sstevel@tonic-gate metaflushsetname(sp); 1114*0Sstevel@tonic-gate 1115*0Sstevel@tonic-gate return (rval); 1116*0Sstevel@tonic-gate } 1117*0Sstevel@tonic-gate 1118*0Sstevel@tonic-gate int 1119*0Sstevel@tonic-gate meta_set_destroy( 1120*0Sstevel@tonic-gate mdsetname_t *sp, 1121*0Sstevel@tonic-gate int lock_set, 1122*0Sstevel@tonic-gate md_error_t *ep 1123*0Sstevel@tonic-gate ) 1124*0Sstevel@tonic-gate { 1125*0Sstevel@tonic-gate int i; 1126*0Sstevel@tonic-gate med_rec_t medr; 1127*0Sstevel@tonic-gate md_set_desc *sd; 1128*0Sstevel@tonic-gate md_drive_desc *dd, *p, *p1; 1129*0Sstevel@tonic-gate mddrivename_t *dnp; 1130*0Sstevel@tonic-gate mdname_t *np; 1131*0Sstevel@tonic-gate mdnamelist_t *nlp = NULL; 1132*0Sstevel@tonic-gate int num_users = 0; 1133*0Sstevel@tonic-gate int has_set; 1134*0Sstevel@tonic-gate side_t mysideno; 1135*0Sstevel@tonic-gate sigset_t oldsigs; 1136*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 1137*0Sstevel@tonic-gate md_setkey_t *cl_sk; 1138*0Sstevel@tonic-gate int rval = 0; 1139*0Sstevel@tonic-gate int delete_end = 1; 1140*0Sstevel@tonic-gate 1141*0Sstevel@tonic-gate /* Make sure we are blocking all signals */ 1142*0Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, ep) < 0) 1143*0Sstevel@tonic-gate return (-1); 1144*0Sstevel@tonic-gate 1145*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1146*0Sstevel@tonic-gate if (! mdisok(ep)) 1147*0Sstevel@tonic-gate rval = -1; 1148*0Sstevel@tonic-gate goto out; 1149*0Sstevel@tonic-gate } 1150*0Sstevel@tonic-gate 1151*0Sstevel@tonic-gate /* 1152*0Sstevel@tonic-gate * meta_set_destroy should not be called for a MN diskset. 
1153*0Sstevel@tonic-gate * This routine destroys a set without communicating this information 1154*0Sstevel@tonic-gate * to the other nodes which would lead to an inconsistency in 1155*0Sstevel@tonic-gate * the MN diskset. 1156*0Sstevel@tonic-gate */ 1157*0Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 1158*0Sstevel@tonic-gate rval = -1; 1159*0Sstevel@tonic-gate goto out; 1160*0Sstevel@tonic-gate } 1161*0Sstevel@tonic-gate 1162*0Sstevel@tonic-gate /* Continue if a traditional diskset */ 1163*0Sstevel@tonic-gate 1164*0Sstevel@tonic-gate /* 1165*0Sstevel@tonic-gate * Check to see who has the set. If we are not the last user of the 1166*0Sstevel@tonic-gate * set, we will not touch the replicas. 1167*0Sstevel@tonic-gate */ 1168*0Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 1169*0Sstevel@tonic-gate /* Skip empty slots */ 1170*0Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 1171*0Sstevel@tonic-gate continue; 1172*0Sstevel@tonic-gate 1173*0Sstevel@tonic-gate has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ, 1174*0Sstevel@tonic-gate ep); 1175*0Sstevel@tonic-gate 1176*0Sstevel@tonic-gate if (has_set < 0) { 1177*0Sstevel@tonic-gate mdclrerror(ep); 1178*0Sstevel@tonic-gate } else 1179*0Sstevel@tonic-gate num_users++; 1180*0Sstevel@tonic-gate } 1181*0Sstevel@tonic-gate 1182*0Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) { 1183*0Sstevel@tonic-gate if (! 
mdisok(ep)) { 1184*0Sstevel@tonic-gate rval = -1; 1185*0Sstevel@tonic-gate goto out; 1186*0Sstevel@tonic-gate } 1187*0Sstevel@tonic-gate } 1188*0Sstevel@tonic-gate 1189*0Sstevel@tonic-gate if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 1190*0Sstevel@tonic-gate rval = -1; 1191*0Sstevel@tonic-gate goto out; 1192*0Sstevel@tonic-gate } 1193*0Sstevel@tonic-gate 1194*0Sstevel@tonic-gate if (lock_set == TRUE) { 1195*0Sstevel@tonic-gate /* Lock the set on our side */ 1196*0Sstevel@tonic-gate if (clnt_lock_set(mynode(), sp, ep)) { 1197*0Sstevel@tonic-gate rval = -1; 1198*0Sstevel@tonic-gate goto out; 1199*0Sstevel@tonic-gate } 1200*0Sstevel@tonic-gate } 1201*0Sstevel@tonic-gate 1202*0Sstevel@tonic-gate /* 1203*0Sstevel@tonic-gate * A traditional diskset has no diskset stale information to send 1204*0Sstevel@tonic-gate * since there can only be one owner node at a time. 1205*0Sstevel@tonic-gate */ 1206*0Sstevel@tonic-gate if (snarf_set(sp, FALSE, ep)) 1207*0Sstevel@tonic-gate mdclrerror(ep); 1208*0Sstevel@tonic-gate 1209*0Sstevel@tonic-gate if (dd != NULL) { 1210*0Sstevel@tonic-gate /* 1211*0Sstevel@tonic-gate * Make sure that no drives are in use as parts of metadrives 1212*0Sstevel@tonic-gate * or hot spare pools, this is one of the few error conditions 1213*0Sstevel@tonic-gate * that will stop this routine, unless the environment has 1214*0Sstevel@tonic-gate * META_DESTROY_SET_OK set, in which case, the operation will 1215*0Sstevel@tonic-gate * proceed. 
1216*0Sstevel@tonic-gate */ 1217*0Sstevel@tonic-gate if (getenv("META_DESTROY_SET_OK") == NULL) { 1218*0Sstevel@tonic-gate for (p = dd; p != NULL; p = p->dd_next) { 1219*0Sstevel@tonic-gate dnp = p->dd_dnp; 1220*0Sstevel@tonic-gate 1221*0Sstevel@tonic-gate i = meta_check_drive_inuse(sp, dnp, FALSE, ep); 1222*0Sstevel@tonic-gate if (i == -1) { 1223*0Sstevel@tonic-gate /* need xep - wire calls clear error */ 1224*0Sstevel@tonic-gate i = metaget_setownership(sp, &xep); 1225*0Sstevel@tonic-gate if (i == -1) { 1226*0Sstevel@tonic-gate rval = -1; 1227*0Sstevel@tonic-gate goto out; 1228*0Sstevel@tonic-gate } 1229*0Sstevel@tonic-gate 1230*0Sstevel@tonic-gate mysideno = getmyside(sp, &xep); 1231*0Sstevel@tonic-gate 1232*0Sstevel@tonic-gate if (mysideno == MD_SIDEWILD) { 1233*0Sstevel@tonic-gate rval = -1; 1234*0Sstevel@tonic-gate goto out; 1235*0Sstevel@tonic-gate } 1236*0Sstevel@tonic-gate 1237*0Sstevel@tonic-gate if (sd->sd_isown[mysideno] == FALSE) 1238*0Sstevel@tonic-gate if (halt_set(sp, &xep)) { 1239*0Sstevel@tonic-gate rval = -1; 1240*0Sstevel@tonic-gate goto out; 1241*0Sstevel@tonic-gate } 1242*0Sstevel@tonic-gate 1243*0Sstevel@tonic-gate rval = -1; 1244*0Sstevel@tonic-gate goto out; 1245*0Sstevel@tonic-gate } 1246*0Sstevel@tonic-gate } 1247*0Sstevel@tonic-gate } 1248*0Sstevel@tonic-gate 1249*0Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 1250*0Sstevel@tonic-gate /* Skip empty slots */ 1251*0Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 1252*0Sstevel@tonic-gate continue; 1253*0Sstevel@tonic-gate 1254*0Sstevel@tonic-gate /* Skip non local nodes */ 1255*0Sstevel@tonic-gate if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1256*0Sstevel@tonic-gate continue; 1257*0Sstevel@tonic-gate 1258*0Sstevel@tonic-gate if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep)) 1259*0Sstevel@tonic-gate mdclrerror(ep); 1260*0Sstevel@tonic-gate } 1261*0Sstevel@tonic-gate 1262*0Sstevel@tonic-gate /* 1263*0Sstevel@tonic-gate * Go thru each drive and individually delete the replicas. 
1264*0Sstevel@tonic-gate * This way we can ignore individual errors. 1265*0Sstevel@tonic-gate */ 1266*0Sstevel@tonic-gate for (p = dd; p != NULL; p = p->dd_next) { 1267*0Sstevel@tonic-gate uint_t rep_slice; 1268*0Sstevel@tonic-gate 1269*0Sstevel@tonic-gate dnp = p->dd_dnp; 1270*0Sstevel@tonic-gate if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || 1271*0Sstevel@tonic-gate (((np = metaslicename(dnp, rep_slice, ep)) 1272*0Sstevel@tonic-gate == NULL) && 1273*0Sstevel@tonic-gate ((np = metaslicename(dnp, MD_SLICE0, ep)) 1274*0Sstevel@tonic-gate == NULL))) { 1275*0Sstevel@tonic-gate rval = -1; 1276*0Sstevel@tonic-gate goto out; 1277*0Sstevel@tonic-gate } 1278*0Sstevel@tonic-gate 1279*0Sstevel@tonic-gate if ((np = metaslicename(dnp, 1280*0Sstevel@tonic-gate rep_slice, ep)) == NULL) { 1281*0Sstevel@tonic-gate if ((np = metaslicename(dnp, 1282*0Sstevel@tonic-gate MD_SLICE0, ep)) == NULL) { 1283*0Sstevel@tonic-gate rval = -1; 1284*0Sstevel@tonic-gate goto out; 1285*0Sstevel@tonic-gate } 1286*0Sstevel@tonic-gate mdclrerror(ep); 1287*0Sstevel@tonic-gate } 1288*0Sstevel@tonic-gate 1289*0Sstevel@tonic-gate /* Yes this is UGLY!!! 
*/ 1290*0Sstevel@tonic-gate p1 = p->dd_next; 1291*0Sstevel@tonic-gate p->dd_next = NULL; 1292*0Sstevel@tonic-gate if (rel_own_bydd(sp, p, FALSE, ep)) 1293*0Sstevel@tonic-gate mdclrerror(ep); 1294*0Sstevel@tonic-gate p->dd_next = p1; 1295*0Sstevel@tonic-gate 1296*0Sstevel@tonic-gate if (p->dd_dbcnt == 0) 1297*0Sstevel@tonic-gate continue; 1298*0Sstevel@tonic-gate 1299*0Sstevel@tonic-gate /* 1300*0Sstevel@tonic-gate * Skip the replica removal if we are not the last user 1301*0Sstevel@tonic-gate */ 1302*0Sstevel@tonic-gate if (num_users != 1) 1303*0Sstevel@tonic-gate continue; 1304*0Sstevel@tonic-gate 1305*0Sstevel@tonic-gate nlp = NULL; 1306*0Sstevel@tonic-gate (void) metanamelist_append(&nlp, np); 1307*0Sstevel@tonic-gate if (meta_db_detach(sp, nlp, 1308*0Sstevel@tonic-gate (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep)) 1309*0Sstevel@tonic-gate mdclrerror(ep); 1310*0Sstevel@tonic-gate metafreenamelist(nlp); 1311*0Sstevel@tonic-gate } 1312*0Sstevel@tonic-gate } 1313*0Sstevel@tonic-gate 1314*0Sstevel@tonic-gate if (halt_set(sp, ep)) { 1315*0Sstevel@tonic-gate rval = -1; 1316*0Sstevel@tonic-gate goto out; 1317*0Sstevel@tonic-gate } 1318*0Sstevel@tonic-gate 1319*0Sstevel@tonic-gate /* Setup the mediator record */ 1320*0Sstevel@tonic-gate (void) memset(&medr, '\0', sizeof (med_rec_t)); 1321*0Sstevel@tonic-gate medr.med_rec_mag = MED_REC_MAGIC; 1322*0Sstevel@tonic-gate medr.med_rec_rev = MED_REC_REV; 1323*0Sstevel@tonic-gate medr.med_rec_fl = 0; 1324*0Sstevel@tonic-gate medr.med_rec_sn = sp->setno; 1325*0Sstevel@tonic-gate (void) strcpy(medr.med_rec_snm, sp->setname); 1326*0Sstevel@tonic-gate medr.med_rec_meds = sd->sd_med; /* structure assignment */ 1327*0Sstevel@tonic-gate (void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t)); 1328*0Sstevel@tonic-gate medr.med_rec_foff = 0; 1329*0Sstevel@tonic-gate 1330*0Sstevel@tonic-gate /* 1331*0Sstevel@tonic-gate * If we are the last remaining user, then remove the mediator hosts 1332*0Sstevel@tonic-gate */ 
1333*0Sstevel@tonic-gate if (num_users == 1) { 1334*0Sstevel@tonic-gate for (i = 0; i < MED_MAX_HOSTS; i++) { 1335*0Sstevel@tonic-gate if (medr.med_rec_meds.n_lst[i].a_cnt != 0) 1336*0Sstevel@tonic-gate SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, 1337*0Sstevel@tonic-gate SVM_TAG_MEDIATOR, sp->setno, i); 1338*0Sstevel@tonic-gate (void) memset(&medr.med_rec_meds.n_lst[i], '\0', 1339*0Sstevel@tonic-gate sizeof (md_h_t)); 1340*0Sstevel@tonic-gate } 1341*0Sstevel@tonic-gate medr.med_rec_meds.n_cnt = 0; 1342*0Sstevel@tonic-gate } else { /* Remove this host from the mediator node list. */ 1343*0Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 1344*0Sstevel@tonic-gate /* Skip empty slots */ 1345*0Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 1346*0Sstevel@tonic-gate continue; 1347*0Sstevel@tonic-gate 1348*0Sstevel@tonic-gate /* Copy non local node */ 1349*0Sstevel@tonic-gate if (strcmp(mynode(), sd->sd_nodes[i]) != 0) { 1350*0Sstevel@tonic-gate (void) strcpy(medr.med_rec_nodes[i], 1351*0Sstevel@tonic-gate sd->sd_nodes[i]); 1352*0Sstevel@tonic-gate continue; 1353*0Sstevel@tonic-gate } 1354*0Sstevel@tonic-gate 1355*0Sstevel@tonic-gate /* Clear local node */ 1356*0Sstevel@tonic-gate (void) memset(&medr.med_rec_nodes[i], '\0', 1357*0Sstevel@tonic-gate sizeof (md_node_nm_t)); 1358*0Sstevel@tonic-gate } 1359*0Sstevel@tonic-gate } 1360*0Sstevel@tonic-gate 1361*0Sstevel@tonic-gate crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); 1362*0Sstevel@tonic-gate 1363*0Sstevel@tonic-gate /* 1364*0Sstevel@tonic-gate * If the client is part of a cluster put the DCS service 1365*0Sstevel@tonic-gate * into a deleting state. 
1366*0Sstevel@tonic-gate */ 1367*0Sstevel@tonic-gate if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1368*0Sstevel@tonic-gate if (metad_isautotakebyname(sp->setname)) { 1369*0Sstevel@tonic-gate delete_end = 0; 1370*0Sstevel@tonic-gate } else { 1371*0Sstevel@tonic-gate mdclrerror(ep); 1372*0Sstevel@tonic-gate goto out; 1373*0Sstevel@tonic-gate } 1374*0Sstevel@tonic-gate } 1375*0Sstevel@tonic-gate 1376*0Sstevel@tonic-gate /* Inform the mediator hosts of the new information */ 1377*0Sstevel@tonic-gate for (i = 0; i < MED_MAX_HOSTS; i++) { 1378*0Sstevel@tonic-gate if (sd->sd_med.n_lst[i].a_cnt == 0) 1379*0Sstevel@tonic-gate continue; 1380*0Sstevel@tonic-gate 1381*0Sstevel@tonic-gate if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep)) 1382*0Sstevel@tonic-gate mdclrerror(ep); 1383*0Sstevel@tonic-gate } 1384*0Sstevel@tonic-gate 1385*0Sstevel@tonic-gate /* Delete the set locally */ 1386*0Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 1387*0Sstevel@tonic-gate /* Skip empty slots */ 1388*0Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 1389*0Sstevel@tonic-gate continue; 1390*0Sstevel@tonic-gate 1391*0Sstevel@tonic-gate /* Skip non local nodes */ 1392*0Sstevel@tonic-gate if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1393*0Sstevel@tonic-gate continue; 1394*0Sstevel@tonic-gate 1395*0Sstevel@tonic-gate if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) 1396*0Sstevel@tonic-gate mdclrerror(ep); 1397*0Sstevel@tonic-gate } 1398*0Sstevel@tonic-gate if (delete_end && 1399*0Sstevel@tonic-gate sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR) 1400*0Sstevel@tonic-gate rval = -1; 1401*0Sstevel@tonic-gate 1402*0Sstevel@tonic-gate out: 1403*0Sstevel@tonic-gate /* release signals back to what they were on entry */ 1404*0Sstevel@tonic-gate if (procsigs(FALSE, &oldsigs, &xep) < 0) { 1405*0Sstevel@tonic-gate if (rval == 0) 1406*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 1407*0Sstevel@tonic-gate rval = -1; 1408*0Sstevel@tonic-gate } 1409*0Sstevel@tonic-gate 
1410*0Sstevel@tonic-gate if (lock_set == TRUE) { 1411*0Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 1412*0Sstevel@tonic-gate if (clnt_unlock_set(mynode(), cl_sk, &xep)) { 1413*0Sstevel@tonic-gate if (rval == 0) 1414*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 1415*0Sstevel@tonic-gate rval = -1; 1416*0Sstevel@tonic-gate } 1417*0Sstevel@tonic-gate cl_set_setkey(NULL); 1418*0Sstevel@tonic-gate } 1419*0Sstevel@tonic-gate 1420*0Sstevel@tonic-gate metaflushsetname(sp); 1421*0Sstevel@tonic-gate return (rval); 1422*0Sstevel@tonic-gate } 1423*0Sstevel@tonic-gate 1424*0Sstevel@tonic-gate int 1425*0Sstevel@tonic-gate meta_set_purge( 1426*0Sstevel@tonic-gate mdsetname_t *sp, 1427*0Sstevel@tonic-gate int bypass_cluster, 1428*0Sstevel@tonic-gate int forceflg, 1429*0Sstevel@tonic-gate md_error_t *ep 1430*0Sstevel@tonic-gate ) 1431*0Sstevel@tonic-gate { 1432*0Sstevel@tonic-gate char *thishost = mynode(); 1433*0Sstevel@tonic-gate md_set_desc *sd; 1434*0Sstevel@tonic-gate md_setkey_t *cl_sk; 1435*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 1436*0Sstevel@tonic-gate int rval = 0; 1437*0Sstevel@tonic-gate int i, num_hosts = 0; 1438*0Sstevel@tonic-gate int has_set = 0; 1439*0Sstevel@tonic-gate int max_node = 0; 1440*0Sstevel@tonic-gate int delete_end = 1; 1441*0Sstevel@tonic-gate md_mnnode_desc *nd; 1442*0Sstevel@tonic-gate 1443*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1444*0Sstevel@tonic-gate /* unable to find set description */ 1445*0Sstevel@tonic-gate rval = 1; 1446*0Sstevel@tonic-gate return (rval); 1447*0Sstevel@tonic-gate } 1448*0Sstevel@tonic-gate 1449*0Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 1450*0Sstevel@tonic-gate /* 1451*0Sstevel@tonic-gate * Get a count of the hosts in the set and also lock the set 1452*0Sstevel@tonic-gate * on those hosts that know about it. 
1453*0Sstevel@tonic-gate */ 1454*0Sstevel@tonic-gate nd = sd->sd_nodelist; 1455*0Sstevel@tonic-gate while (nd) { 1456*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1457*0Sstevel@tonic-gate nd = nd->nd_next; 1458*0Sstevel@tonic-gate continue; 1459*0Sstevel@tonic-gate } 1460*0Sstevel@tonic-gate has_set = nodehasset(sp, nd->nd_nodename, 1461*0Sstevel@tonic-gate NHS_NST_EQ, ep); 1462*0Sstevel@tonic-gate 1463*0Sstevel@tonic-gate /* 1464*0Sstevel@tonic-gate * The host is not aware of this set (has_set < 0) or 1465*0Sstevel@tonic-gate * the set does not match (has_set == 0). This check 1466*0Sstevel@tonic-gate * prevents the code getting confused by an apparent 1467*0Sstevel@tonic-gate * inconsistency in the set's state, this is in the 1468*0Sstevel@tonic-gate * purge code so something is broken in any case and 1469*0Sstevel@tonic-gate * this is just trying to fix the brokenness. 1470*0Sstevel@tonic-gate */ 1471*0Sstevel@tonic-gate if (has_set <= 0) { 1472*0Sstevel@tonic-gate mdclrerror(ep); 1473*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_NOSET; 1474*0Sstevel@tonic-gate } else { 1475*0Sstevel@tonic-gate num_hosts++; 1476*0Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 1477*0Sstevel@tonic-gate /* 1478*0Sstevel@tonic-gate * If the force flag is set then 1479*0Sstevel@tonic-gate * ignore any RPC failures because we 1480*0Sstevel@tonic-gate * are only really interested with 1481*0Sstevel@tonic-gate * the set on local node. 1482*0Sstevel@tonic-gate */ 1483*0Sstevel@tonic-gate if (forceflg && mdanyrpcerror(ep)) { 1484*0Sstevel@tonic-gate mdclrerror(ep); 1485*0Sstevel@tonic-gate } else { 1486*0Sstevel@tonic-gate /* 1487*0Sstevel@tonic-gate * set max_node so that in the 1488*0Sstevel@tonic-gate * unlock code nodes in the 1489*0Sstevel@tonic-gate * set that have not been 1490*0Sstevel@tonic-gate * locked are not unlocked. 
1491*0Sstevel@tonic-gate */ 1492*0Sstevel@tonic-gate max_node = nd->nd_nodeid; 1493*0Sstevel@tonic-gate rval = 2; 1494*0Sstevel@tonic-gate goto out1; 1495*0Sstevel@tonic-gate } 1496*0Sstevel@tonic-gate } 1497*0Sstevel@tonic-gate 1498*0Sstevel@tonic-gate } 1499*0Sstevel@tonic-gate nd = nd->nd_next; 1500*0Sstevel@tonic-gate } 1501*0Sstevel@tonic-gate max_node = 0; 1502*0Sstevel@tonic-gate } else { 1503*0Sstevel@tonic-gate /* 1504*0Sstevel@tonic-gate * Get a count of the hosts in the set and also lock the set 1505*0Sstevel@tonic-gate * on those hosts that know about it. 1506*0Sstevel@tonic-gate */ 1507*0Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 1508*0Sstevel@tonic-gate /* Skip empty slots */ 1509*0Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 1510*0Sstevel@tonic-gate continue; 1511*0Sstevel@tonic-gate 1512*0Sstevel@tonic-gate has_set = nodehasset(sp, sd->sd_nodes[i], 1513*0Sstevel@tonic-gate NHS_NST_EQ, ep); 1514*0Sstevel@tonic-gate 1515*0Sstevel@tonic-gate /* 1516*0Sstevel@tonic-gate * The host is not aware of this set (has_set < 0) or 1517*0Sstevel@tonic-gate * the set does not match (has_set == 0). This check 1518*0Sstevel@tonic-gate * prevents the code getting confused by an apparent 1519*0Sstevel@tonic-gate * inconsistancy in the set's state, this is in the 1520*0Sstevel@tonic-gate * purge code so something is broken in any case and 1521*0Sstevel@tonic-gate * this is just trying to fix the brokeness. 1522*0Sstevel@tonic-gate */ 1523*0Sstevel@tonic-gate if (has_set <= 0) { 1524*0Sstevel@tonic-gate mdclrerror(ep); 1525*0Sstevel@tonic-gate /* 1526*0Sstevel@tonic-gate * set the node to NULL to prevent further 1527*0Sstevel@tonic-gate * requests to this unresponsive node. 
1528*0Sstevel@tonic-gate */ 1529*0Sstevel@tonic-gate sd->sd_nodes[i][0] = '\0'; 1530*0Sstevel@tonic-gate } else { 1531*0Sstevel@tonic-gate num_hosts++; 1532*0Sstevel@tonic-gate if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1533*0Sstevel@tonic-gate /* 1534*0Sstevel@tonic-gate * If the force flag is set then 1535*0Sstevel@tonic-gate * ignore any RPC failures because we 1536*0Sstevel@tonic-gate * are only really interested with 1537*0Sstevel@tonic-gate * the set on local node. 1538*0Sstevel@tonic-gate */ 1539*0Sstevel@tonic-gate if (forceflg && mdanyrpcerror(ep)) { 1540*0Sstevel@tonic-gate mdclrerror(ep); 1541*0Sstevel@tonic-gate } else { 1542*0Sstevel@tonic-gate rval = 2; 1543*0Sstevel@tonic-gate /* 1544*0Sstevel@tonic-gate * set max_node so that in the 1545*0Sstevel@tonic-gate * unlock code nodes in the 1546*0Sstevel@tonic-gate * set that have not been 1547*0Sstevel@tonic-gate * locked are not unlocked. 1548*0Sstevel@tonic-gate */ 1549*0Sstevel@tonic-gate max_node = i; 1550*0Sstevel@tonic-gate goto out1; 1551*0Sstevel@tonic-gate } 1552*0Sstevel@tonic-gate } 1553*0Sstevel@tonic-gate } 1554*0Sstevel@tonic-gate } 1555*0Sstevel@tonic-gate max_node = i; /* now MD_MAXSIDES */ 1556*0Sstevel@tonic-gate } 1557*0Sstevel@tonic-gate if (!bypass_cluster) { 1558*0Sstevel@tonic-gate /* 1559*0Sstevel@tonic-gate * If there is only one host associated with the 1560*0Sstevel@tonic-gate * set then remove the set from the cluster. 
1561*0Sstevel@tonic-gate */ 1562*0Sstevel@tonic-gate if (num_hosts == 1) { 1563*0Sstevel@tonic-gate if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1564*0Sstevel@tonic-gate if (metad_isautotakebyname(sp->setname)) { 1565*0Sstevel@tonic-gate delete_end = 0; 1566*0Sstevel@tonic-gate } else { 1567*0Sstevel@tonic-gate mdclrerror(ep); 1568*0Sstevel@tonic-gate rval = 3; 1569*0Sstevel@tonic-gate goto out1; 1570*0Sstevel@tonic-gate } 1571*0Sstevel@tonic-gate } 1572*0Sstevel@tonic-gate } 1573*0Sstevel@tonic-gate } 1574*0Sstevel@tonic-gate 1575*0Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 1576*0Sstevel@tonic-gate /* 1577*0Sstevel@tonic-gate * Get a count of the hosts in the set and also lock the set 1578*0Sstevel@tonic-gate * on those hosts that know about it. 1579*0Sstevel@tonic-gate */ 1580*0Sstevel@tonic-gate nd = sd->sd_nodelist; 1581*0Sstevel@tonic-gate while (nd) { 1582*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1583*0Sstevel@tonic-gate nd = nd->nd_next; 1584*0Sstevel@tonic-gate continue; 1585*0Sstevel@tonic-gate } 1586*0Sstevel@tonic-gate if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) { 1587*0Sstevel@tonic-gate /* 1588*0Sstevel@tonic-gate * Tell the remote node to remove this node 1589*0Sstevel@tonic-gate */ 1590*0Sstevel@tonic-gate if (clnt_delhosts(nd->nd_nodename, sp, 1, 1591*0Sstevel@tonic-gate &thishost, ep) == -1) { 1592*0Sstevel@tonic-gate /* 1593*0Sstevel@tonic-gate * If we fail to delete ourselves 1594*0Sstevel@tonic-gate * from the remote host it does not 1595*0Sstevel@tonic-gate * really matter because the set is 1596*0Sstevel@tonic-gate * being "purged" from this node. The 1597*0Sstevel@tonic-gate * set can be purged from the other 1598*0Sstevel@tonic-gate * node at a later time. 
1599*0Sstevel@tonic-gate */ 1600*0Sstevel@tonic-gate mdclrerror(ep); 1601*0Sstevel@tonic-gate } 1602*0Sstevel@tonic-gate nd = nd->nd_next; 1603*0Sstevel@tonic-gate continue; 1604*0Sstevel@tonic-gate } 1605*0Sstevel@tonic-gate /* remove the set from this host */ 1606*0Sstevel@tonic-gate if (clnt_delset(nd->nd_nodename, sp, ep) == -1) { 1607*0Sstevel@tonic-gate md_perror(dgettext(TEXT_DOMAIN, "delset")); 1608*0Sstevel@tonic-gate if (!bypass_cluster && num_hosts == 1) 1609*0Sstevel@tonic-gate (void) sdssc_delete_end(sp->setname, 1610*0Sstevel@tonic-gate SDSSC_CLEANUP); 1611*0Sstevel@tonic-gate mdclrerror(ep); 1612*0Sstevel@tonic-gate goto out1; 1613*0Sstevel@tonic-gate } 1614*0Sstevel@tonic-gate nd = nd->nd_next; 1615*0Sstevel@tonic-gate } 1616*0Sstevel@tonic-gate } else { 1617*0Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 1618*0Sstevel@tonic-gate /* Skip empty slots */ 1619*0Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 1620*0Sstevel@tonic-gate continue; 1621*0Sstevel@tonic-gate if (strcmp(thishost, sd->sd_nodes[i]) != 0) { 1622*0Sstevel@tonic-gate /* 1623*0Sstevel@tonic-gate * Tell the remote node to remove this node 1624*0Sstevel@tonic-gate */ 1625*0Sstevel@tonic-gate if (clnt_delhosts(sd->sd_nodes[i], sp, 1, 1626*0Sstevel@tonic-gate &thishost, ep) == -1) { 1627*0Sstevel@tonic-gate /* 1628*0Sstevel@tonic-gate * If we fail to delete ourselves 1629*0Sstevel@tonic-gate * from the remote host it does not 1630*0Sstevel@tonic-gate * really matter because the set is 1631*0Sstevel@tonic-gate * being "purged" from this node. The 1632*0Sstevel@tonic-gate * set can be purged from the other 1633*0Sstevel@tonic-gate * node at a later time. 
1634*0Sstevel@tonic-gate */ 1635*0Sstevel@tonic-gate mdclrerror(ep); 1636*0Sstevel@tonic-gate } 1637*0Sstevel@tonic-gate continue; 1638*0Sstevel@tonic-gate } 1639*0Sstevel@tonic-gate 1640*0Sstevel@tonic-gate /* remove the set from this host */ 1641*0Sstevel@tonic-gate if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) { 1642*0Sstevel@tonic-gate md_perror(dgettext(TEXT_DOMAIN, "delset")); 1643*0Sstevel@tonic-gate if (!bypass_cluster && num_hosts == 1) 1644*0Sstevel@tonic-gate (void) sdssc_delete_end(sp->setname, 1645*0Sstevel@tonic-gate SDSSC_CLEANUP); 1646*0Sstevel@tonic-gate mdclrerror(ep); 1647*0Sstevel@tonic-gate goto out1; 1648*0Sstevel@tonic-gate } 1649*0Sstevel@tonic-gate } 1650*0Sstevel@tonic-gate } 1651*0Sstevel@tonic-gate 1652*0Sstevel@tonic-gate if (!bypass_cluster && num_hosts == 1) { 1653*0Sstevel@tonic-gate if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) == 1654*0Sstevel@tonic-gate SDSSC_ERROR) { 1655*0Sstevel@tonic-gate rval = 4; 1656*0Sstevel@tonic-gate } 1657*0Sstevel@tonic-gate } 1658*0Sstevel@tonic-gate 1659*0Sstevel@tonic-gate out1: 1660*0Sstevel@tonic-gate 1661*0Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 1662*0Sstevel@tonic-gate 1663*0Sstevel@tonic-gate /* 1664*0Sstevel@tonic-gate * Remove the set lock on those nodes that had the set locked 1665*0Sstevel@tonic-gate * max_node will either be MD_MAXSIDES or array index of the last 1666*0Sstevel@tonic-gate * node contacted (or rather failed to contact) for traditional 1667*0Sstevel@tonic-gate * diskset. For a MN diskset, max_node is the node_id of the node 1668*0Sstevel@tonic-gate * that failed the lock. 
1669*0Sstevel@tonic-gate */ 1670*0Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 1671*0Sstevel@tonic-gate nd = sd->sd_nodelist; 1672*0Sstevel@tonic-gate while (nd) { 1673*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1674*0Sstevel@tonic-gate nd = nd->nd_next; 1675*0Sstevel@tonic-gate continue; 1676*0Sstevel@tonic-gate } 1677*0Sstevel@tonic-gate if (nd->nd_nodeid == max_node) 1678*0Sstevel@tonic-gate break; 1679*0Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1680*0Sstevel@tonic-gate if (forceflg && mdanyrpcerror(&xep)) { 1681*0Sstevel@tonic-gate mdclrerror(&xep); 1682*0Sstevel@tonic-gate nd = nd->nd_next; 1683*0Sstevel@tonic-gate continue; 1684*0Sstevel@tonic-gate } 1685*0Sstevel@tonic-gate if (rval == 0) 1686*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 1687*0Sstevel@tonic-gate rval = 5; 1688*0Sstevel@tonic-gate } 1689*0Sstevel@tonic-gate nd = nd->nd_next; 1690*0Sstevel@tonic-gate } 1691*0Sstevel@tonic-gate } else { 1692*0Sstevel@tonic-gate for (i = 0; i < max_node; i++) { 1693*0Sstevel@tonic-gate /* Skip empty slots */ 1694*0Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 1695*0Sstevel@tonic-gate continue; 1696*0Sstevel@tonic-gate 1697*0Sstevel@tonic-gate if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1698*0Sstevel@tonic-gate if (forceflg && mdanyrpcerror(&xep)) { 1699*0Sstevel@tonic-gate mdclrerror(&xep); 1700*0Sstevel@tonic-gate continue; 1701*0Sstevel@tonic-gate } 1702*0Sstevel@tonic-gate if (rval == 0) 1703*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 1704*0Sstevel@tonic-gate rval = 5; 1705*0Sstevel@tonic-gate } 1706*0Sstevel@tonic-gate } 1707*0Sstevel@tonic-gate } 1708*0Sstevel@tonic-gate 1709*0Sstevel@tonic-gate cl_set_setkey(NULL); 1710*0Sstevel@tonic-gate 1711*0Sstevel@tonic-gate return (rval); 1712*0Sstevel@tonic-gate } 1713*0Sstevel@tonic-gate 1714*0Sstevel@tonic-gate int 1715*0Sstevel@tonic-gate meta_set_query( 1716*0Sstevel@tonic-gate mdsetname_t *sp, 1717*0Sstevel@tonic-gate mddb_dtag_lst_t 
**dtlpp, 1718*0Sstevel@tonic-gate md_error_t *ep 1719*0Sstevel@tonic-gate ) 1720*0Sstevel@tonic-gate { 1721*0Sstevel@tonic-gate mddb_dtag_get_parm_t dtgp; 1722*0Sstevel@tonic-gate 1723*0Sstevel@tonic-gate (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t)); 1724*0Sstevel@tonic-gate dtgp.dtgp_setno = sp->setno; 1725*0Sstevel@tonic-gate 1726*0Sstevel@tonic-gate /*CONSTCOND*/ 1727*0Sstevel@tonic-gate while (1) { 1728*0Sstevel@tonic-gate if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0) 1729*0Sstevel@tonic-gate if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) || 1730*0Sstevel@tonic-gate *dtlpp == NULL) 1731*0Sstevel@tonic-gate return (mdstealerror(ep, &dtgp.dtgp_mde)); 1732*0Sstevel@tonic-gate else 1733*0Sstevel@tonic-gate break; 1734*0Sstevel@tonic-gate 1735*0Sstevel@tonic-gate /* 1736*0Sstevel@tonic-gate * Run to the end of the list 1737*0Sstevel@tonic-gate */ 1738*0Sstevel@tonic-gate for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) 1739*0Sstevel@tonic-gate /* void */; 1740*0Sstevel@tonic-gate 1741*0Sstevel@tonic-gate *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t)); 1742*0Sstevel@tonic-gate 1743*0Sstevel@tonic-gate (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt, 1744*0Sstevel@tonic-gate sizeof (mddb_dtag_t)); 1745*0Sstevel@tonic-gate 1746*0Sstevel@tonic-gate dtgp.dtgp_dt.dt_id++; 1747*0Sstevel@tonic-gate } 1748*0Sstevel@tonic-gate return (0); 1749*0Sstevel@tonic-gate } 1750*0Sstevel@tonic-gate 1751*0Sstevel@tonic-gate /* 1752*0Sstevel@tonic-gate * return drivename get by key 1753*0Sstevel@tonic-gate */ 1754*0Sstevel@tonic-gate mddrivename_t * 1755*0Sstevel@tonic-gate metadrivename_withdrkey( 1756*0Sstevel@tonic-gate mdsetname_t *sp, 1757*0Sstevel@tonic-gate side_t sideno, 1758*0Sstevel@tonic-gate mdkey_t key, 1759*0Sstevel@tonic-gate int flags, 1760*0Sstevel@tonic-gate md_error_t *ep 1761*0Sstevel@tonic-gate ) 1762*0Sstevel@tonic-gate { 1763*0Sstevel@tonic-gate char *nm; 1764*0Sstevel@tonic-gate mdname_t *np; 1765*0Sstevel@tonic-gate 
mddrivename_t *dnp; 1766*0Sstevel@tonic-gate ddi_devid_t devidp; 1767*0Sstevel@tonic-gate md_set_desc *sd; 1768*0Sstevel@tonic-gate 1769*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1770*0Sstevel@tonic-gate return (NULL); 1771*0Sstevel@tonic-gate } 1772*0Sstevel@tonic-gate 1773*0Sstevel@tonic-gate /* get namespace info */ 1774*0Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 1775*0Sstevel@tonic-gate if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, 1776*0Sstevel@tonic-gate key, ep)) == NULL) 1777*0Sstevel@tonic-gate return (NULL); 1778*0Sstevel@tonic-gate } else { 1779*0Sstevel@tonic-gate if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno+SKEW, 1780*0Sstevel@tonic-gate key, ep)) == NULL) 1781*0Sstevel@tonic-gate return (NULL); 1782*0Sstevel@tonic-gate } 1783*0Sstevel@tonic-gate 1784*0Sstevel@tonic-gate /* get device name */ 1785*0Sstevel@tonic-gate if (flags & PRINT_FAST) { 1786*0Sstevel@tonic-gate if ((np = metaname_fast(&sp, nm, ep)) == NULL) { 1787*0Sstevel@tonic-gate Free(nm); 1788*0Sstevel@tonic-gate return (NULL); 1789*0Sstevel@tonic-gate } 1790*0Sstevel@tonic-gate } else { 1791*0Sstevel@tonic-gate if ((np = metaname(&sp, nm, ep)) == NULL) { 1792*0Sstevel@tonic-gate Free(nm); 1793*0Sstevel@tonic-gate return (NULL); 1794*0Sstevel@tonic-gate } 1795*0Sstevel@tonic-gate } 1796*0Sstevel@tonic-gate Free(nm); 1797*0Sstevel@tonic-gate 1798*0Sstevel@tonic-gate /* make sure it's OK */ 1799*0Sstevel@tonic-gate if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, ep) != 0)) 1800*0Sstevel@tonic-gate return (NULL); 1801*0Sstevel@tonic-gate 1802*0Sstevel@tonic-gate /* get drivename */ 1803*0Sstevel@tonic-gate dnp = np->drivenamep; 1804*0Sstevel@tonic-gate dnp->side_names_key = key; 1805*0Sstevel@tonic-gate 1806*0Sstevel@tonic-gate /* 1807*0Sstevel@tonic-gate * Skip the following devid check if dnp is did device 1808*0Sstevel@tonic-gate * The device id is disabled for did device due to the 1809*0Sstevel@tonic-gate * lack of minor name support in the did driver. 
The following 1810*0Sstevel@tonic-gate * devid code path can set and propagate the error and 1811*0Sstevel@tonic-gate * eventually prevent did disks from being added to the 1812*0Sstevel@tonic-gate * diskset under SunCluster systems 1813*0Sstevel@tonic-gate */ 1814*0Sstevel@tonic-gate if (strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) == 0) { 1815*0Sstevel@tonic-gate goto out; 1816*0Sstevel@tonic-gate } 1817*0Sstevel@tonic-gate 1818*0Sstevel@tonic-gate /* Also, Skip the check if MN diskset, no devid's */ 1819*0Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 1820*0Sstevel@tonic-gate goto out; 1821*0Sstevel@tonic-gate } 1822*0Sstevel@tonic-gate 1823*0Sstevel@tonic-gate /* 1824*0Sstevel@tonic-gate * Get the devid associated with the key. 1825*0Sstevel@tonic-gate * 1826*0Sstevel@tonic-gate * If a devid was returned, it MUST be valid even in 1827*0Sstevel@tonic-gate * the case where a device id has been "updated". The 1828*0Sstevel@tonic-gate * "update" of the device id may have occured due to 1829*0Sstevel@tonic-gate * a firmware upgrade. 1830*0Sstevel@tonic-gate */ 1831*0Sstevel@tonic-gate if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep)) 1832*0Sstevel@tonic-gate != NULL) { 1833*0Sstevel@tonic-gate dnp->devid = devid_str_encode(devidp, NULL); 1834*0Sstevel@tonic-gate free(devidp); 1835*0Sstevel@tonic-gate } else { 1836*0Sstevel@tonic-gate /* 1837*0Sstevel@tonic-gate * It is okay if replica is not in devid mode 1838*0Sstevel@tonic-gate */ 1839*0Sstevel@tonic-gate if (mdissyserror(ep, MDDB_F_NODEVID)) { 1840*0Sstevel@tonic-gate mdclrerror(ep); 1841*0Sstevel@tonic-gate goto out; 1842*0Sstevel@tonic-gate } 1843*0Sstevel@tonic-gate 1844*0Sstevel@tonic-gate /* 1845*0Sstevel@tonic-gate * devid is missing so this means that we have 1846*0Sstevel@tonic-gate * just upgraded from a configuration where 1847*0Sstevel@tonic-gate * devid's were not used so try to add in 1848*0Sstevel@tonic-gate * the devid and requery. 
1849*0Sstevel@tonic-gate */ 1850*0Sstevel@tonic-gate if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, 1851*0Sstevel@tonic-gate ep) < 0) 1852*0Sstevel@tonic-gate return (NULL); 1853*0Sstevel@tonic-gate if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET, 1854*0Sstevel@tonic-gate sideno+SKEW, key, ep)) == NULL) 1855*0Sstevel@tonic-gate return (NULL); 1856*0Sstevel@tonic-gate dnp->devid = devid_str_encode(devidp, NULL); 1857*0Sstevel@tonic-gate devid_free(devidp); 1858*0Sstevel@tonic-gate } 1859*0Sstevel@tonic-gate 1860*0Sstevel@tonic-gate out: 1861*0Sstevel@tonic-gate if (flags & MD_BYPASS_DAEMON) 1862*0Sstevel@tonic-gate return (dnp); 1863*0Sstevel@tonic-gate 1864*0Sstevel@tonic-gate if (get_sidenmlist(sp, dnp, ep)) 1865*0Sstevel@tonic-gate return (NULL); 1866*0Sstevel@tonic-gate 1867*0Sstevel@tonic-gate /* return success */ 1868*0Sstevel@tonic-gate return (dnp); 1869*0Sstevel@tonic-gate } 1870*0Sstevel@tonic-gate 1871*0Sstevel@tonic-gate void 1872*0Sstevel@tonic-gate metafreedrivedesc(md_drive_desc **dd) 1873*0Sstevel@tonic-gate { 1874*0Sstevel@tonic-gate md_drive_desc *p, *next = NULL; 1875*0Sstevel@tonic-gate 1876*0Sstevel@tonic-gate for (p = *dd; p != NULL; p = next) { 1877*0Sstevel@tonic-gate next = p->dd_next; 1878*0Sstevel@tonic-gate Free(p); 1879*0Sstevel@tonic-gate } 1880*0Sstevel@tonic-gate *dd = NULL; 1881*0Sstevel@tonic-gate } 1882*0Sstevel@tonic-gate 1883*0Sstevel@tonic-gate md_drive_desc * 1884*0Sstevel@tonic-gate metaget_drivedesc( 1885*0Sstevel@tonic-gate mdsetname_t *sp, 1886*0Sstevel@tonic-gate int flags, 1887*0Sstevel@tonic-gate md_error_t *ep 1888*0Sstevel@tonic-gate ) 1889*0Sstevel@tonic-gate { 1890*0Sstevel@tonic-gate side_t sideno = MD_SIDEWILD; 1891*0Sstevel@tonic-gate 1892*0Sstevel@tonic-gate assert(! 
(flags & MD_BYPASS_DAEMON)); 1893*0Sstevel@tonic-gate 1894*0Sstevel@tonic-gate if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 1895*0Sstevel@tonic-gate return (NULL); 1896*0Sstevel@tonic-gate 1897*0Sstevel@tonic-gate return (metaget_drivedesc_sideno(sp, sideno, flags, ep)); 1898*0Sstevel@tonic-gate } 1899*0Sstevel@tonic-gate 1900*0Sstevel@tonic-gate md_drive_desc * 1901*0Sstevel@tonic-gate metaget_drivedesc_fromnamelist( 1902*0Sstevel@tonic-gate mdsetname_t *sp, 1903*0Sstevel@tonic-gate mdnamelist_t *nlp, 1904*0Sstevel@tonic-gate md_error_t *ep 1905*0Sstevel@tonic-gate ) 1906*0Sstevel@tonic-gate { 1907*0Sstevel@tonic-gate md_set_desc *sd; 1908*0Sstevel@tonic-gate mdnamelist_t *p; 1909*0Sstevel@tonic-gate md_drive_desc *dd = NULL; 1910*0Sstevel@tonic-gate 1911*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 1912*0Sstevel@tonic-gate return (NULL); 1913*0Sstevel@tonic-gate 1914*0Sstevel@tonic-gate for (p = nlp; p != NULL; p = p->next) 1915*0Sstevel@tonic-gate (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0, 1916*0Sstevel@tonic-gate sd->sd_ctime, sd->sd_genid, MD_DR_ADD); 1917*0Sstevel@tonic-gate 1918*0Sstevel@tonic-gate return (dd); 1919*0Sstevel@tonic-gate } 1920*0Sstevel@tonic-gate 1921*0Sstevel@tonic-gate md_drive_desc * 1922*0Sstevel@tonic-gate metaget_drivedesc_sideno( 1923*0Sstevel@tonic-gate mdsetname_t *sp, 1924*0Sstevel@tonic-gate side_t sideno, 1925*0Sstevel@tonic-gate int flags, 1926*0Sstevel@tonic-gate md_error_t *ep 1927*0Sstevel@tonic-gate ) 1928*0Sstevel@tonic-gate { 1929*0Sstevel@tonic-gate md_set_desc *sd = NULL; 1930*0Sstevel@tonic-gate 1931*0Sstevel@tonic-gate assert(! 
(flags & MD_BYPASS_DAEMON)); 1932*0Sstevel@tonic-gate 1933*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 1934*0Sstevel@tonic-gate return (NULL); 1935*0Sstevel@tonic-gate 1936*0Sstevel@tonic-gate if (sd->sd_drvs) 1937*0Sstevel@tonic-gate return (sd->sd_drvs); 1938*0Sstevel@tonic-gate 1939*0Sstevel@tonic-gate if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL) 1940*0Sstevel@tonic-gate return (NULL); 1941*0Sstevel@tonic-gate 1942*0Sstevel@tonic-gate return (sd->sd_drvs); 1943*0Sstevel@tonic-gate } 1944*0Sstevel@tonic-gate 1945*0Sstevel@tonic-gate int 1946*0Sstevel@tonic-gate metaget_setownership( 1947*0Sstevel@tonic-gate mdsetname_t *sp, 1948*0Sstevel@tonic-gate md_error_t *ep 1949*0Sstevel@tonic-gate ) 1950*0Sstevel@tonic-gate { 1951*0Sstevel@tonic-gate md_set_desc *sd; 1952*0Sstevel@tonic-gate int bool; 1953*0Sstevel@tonic-gate int i; 1954*0Sstevel@tonic-gate md_mnnode_desc *nd; 1955*0Sstevel@tonic-gate 1956*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 1957*0Sstevel@tonic-gate return (-1); 1958*0Sstevel@tonic-gate 1959*0Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 1960*0Sstevel@tonic-gate nd = sd->sd_nodelist; 1961*0Sstevel@tonic-gate while (nd) { 1962*0Sstevel@tonic-gate /* If node isn't alive, can't own diskset */ 1963*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1964*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 1965*0Sstevel@tonic-gate nd = nd->nd_next; 1966*0Sstevel@tonic-gate continue; 1967*0Sstevel@tonic-gate } 1968*0Sstevel@tonic-gate /* 1969*0Sstevel@tonic-gate * If can't communicate with rpc.metad, then mark 1970*0Sstevel@tonic-gate * this node as not an owner. That node may 1971*0Sstevel@tonic-gate * in fact, be an owner, but without rpc.metad running 1972*0Sstevel@tonic-gate * that node can't do much. 
1973*0Sstevel@tonic-gate */ 1974*0Sstevel@tonic-gate if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) { 1975*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 1976*0Sstevel@tonic-gate } else if (bool == TRUE) { 1977*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN; 1978*0Sstevel@tonic-gate } else { 1979*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 1980*0Sstevel@tonic-gate } 1981*0Sstevel@tonic-gate nd = nd->nd_next; 1982*0Sstevel@tonic-gate } 1983*0Sstevel@tonic-gate return (0); 1984*0Sstevel@tonic-gate } 1985*0Sstevel@tonic-gate 1986*0Sstevel@tonic-gate /* Rest of code handles traditional disksets */ 1987*0Sstevel@tonic-gate 1988*0Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) 1989*0Sstevel@tonic-gate sd->sd_isown[i] = 0; 1990*0Sstevel@tonic-gate 1991*0Sstevel@tonic-gate if (clnt_ownset(mynode(), sp, &bool, ep) == -1) 1992*0Sstevel@tonic-gate return (-1); 1993*0Sstevel@tonic-gate 1994*0Sstevel@tonic-gate if (bool == TRUE) 1995*0Sstevel@tonic-gate sd->sd_isown[getmyside(sp, ep)] = 1; 1996*0Sstevel@tonic-gate 1997*0Sstevel@tonic-gate return (0); 1998*0Sstevel@tonic-gate } 1999*0Sstevel@tonic-gate 2000*0Sstevel@tonic-gate char * 2001*0Sstevel@tonic-gate mynode(void) 2002*0Sstevel@tonic-gate { 2003*0Sstevel@tonic-gate static struct utsname myuname; 2004*0Sstevel@tonic-gate static int done = 0; 2005*0Sstevel@tonic-gate 2006*0Sstevel@tonic-gate if (! 
/*
 * strinlst - report whether str matches one of the first cnt entries
 * of lst.  Returns TRUE on the first exact match, FALSE otherwise
 * (including when cnt is zero or negative).
 */
int
strinlst(char *str, int cnt, char **lst)
{
	char	**cur = lst;
	int	left;

	for (left = cnt; left > 0; left--, cur++) {
		if (strcmp(*cur, str) == 0)
			return (TRUE);
	}

	return (FALSE);
}
reserved slice for db in sets, logs) 2033*0Sstevel@tonic-gate */ 2034*0Sstevel@tonic-gate 2035*0Sstevel@tonic-gate /*ARGSUSED*/ 2036*0Sstevel@tonic-gate int 2037*0Sstevel@tonic-gate meta_get_reserved_names( 2038*0Sstevel@tonic-gate mdsetname_t *sp, 2039*0Sstevel@tonic-gate mdnamelist_t **nlpp, 2040*0Sstevel@tonic-gate int options, 2041*0Sstevel@tonic-gate md_error_t *ep) 2042*0Sstevel@tonic-gate { 2043*0Sstevel@tonic-gate int count = 0; 2044*0Sstevel@tonic-gate mdname_t *np = NULL; 2045*0Sstevel@tonic-gate mdnamelist_t *transnlp = NULL; 2046*0Sstevel@tonic-gate mdnamelist_t **tailpp = nlpp; 2047*0Sstevel@tonic-gate mdnamelist_t *nlp; 2048*0Sstevel@tonic-gate md_drive_desc *dd, *di; 2049*0Sstevel@tonic-gate 2050*0Sstevel@tonic-gate if (metaislocalset(sp)) 2051*0Sstevel@tonic-gate goto out; 2052*0Sstevel@tonic-gate 2053*0Sstevel@tonic-gate if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) { 2054*0Sstevel@tonic-gate count = -1; 2055*0Sstevel@tonic-gate goto out; 2056*0Sstevel@tonic-gate } 2057*0Sstevel@tonic-gate 2058*0Sstevel@tonic-gate /* db in for sets on reserved slice */ 2059*0Sstevel@tonic-gate for (di = dd; di && count >= 0; di = di->dd_next) { 2060*0Sstevel@tonic-gate uint_t rep_slice; 2061*0Sstevel@tonic-gate 2062*0Sstevel@tonic-gate /* 2063*0Sstevel@tonic-gate * Add the name struct to the end of the 2064*0Sstevel@tonic-gate * namelist but keep a pointer to the last 2065*0Sstevel@tonic-gate * element so that we don't incur the overhead 2066*0Sstevel@tonic-gate * of traversing the list each time 2067*0Sstevel@tonic-gate */ 2068*0Sstevel@tonic-gate if (di->dd_dnp && 2069*0Sstevel@tonic-gate (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) && 2070*0Sstevel@tonic-gate (np = metaslicename(di->dd_dnp, rep_slice, ep)) && 2071*0Sstevel@tonic-gate (tailpp = meta_namelist_append_wrapper(tailpp, np))) 2072*0Sstevel@tonic-gate count++; 2073*0Sstevel@tonic-gate else 2074*0Sstevel@tonic-gate count = -1; 2075*0Sstevel@tonic-gate } 
2076*0Sstevel@tonic-gate 2077*0Sstevel@tonic-gate /* now find logs */ 2078*0Sstevel@tonic-gate if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) { 2079*0Sstevel@tonic-gate count = -1; 2080*0Sstevel@tonic-gate goto out; 2081*0Sstevel@tonic-gate } 2082*0Sstevel@tonic-gate 2083*0Sstevel@tonic-gate for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) { 2084*0Sstevel@tonic-gate mdname_t *transnp = nlp->namep; 2085*0Sstevel@tonic-gate md_trans_t *transp; 2086*0Sstevel@tonic-gate 2087*0Sstevel@tonic-gate if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) { 2088*0Sstevel@tonic-gate count = -1; 2089*0Sstevel@tonic-gate goto out; 2090*0Sstevel@tonic-gate } 2091*0Sstevel@tonic-gate if (transp->lognamep) { 2092*0Sstevel@tonic-gate /* 2093*0Sstevel@tonic-gate * Add the name struct to the end of the 2094*0Sstevel@tonic-gate * namelist but keep a pointer to the last 2095*0Sstevel@tonic-gate * element so that we don't incur the overhead 2096*0Sstevel@tonic-gate * of traversing the list each time 2097*0Sstevel@tonic-gate */ 2098*0Sstevel@tonic-gate tailpp = meta_namelist_append_wrapper( 2099*0Sstevel@tonic-gate tailpp, transp->lognamep); 2100*0Sstevel@tonic-gate } 2101*0Sstevel@tonic-gate } 2102*0Sstevel@tonic-gate out: 2103*0Sstevel@tonic-gate metafreenamelist(transnlp); 2104*0Sstevel@tonic-gate return (count); 2105*0Sstevel@tonic-gate } 2106*0Sstevel@tonic-gate 2107*0Sstevel@tonic-gate /* 2108*0Sstevel@tonic-gate * Entry point to join a node to MultiNode diskset. 2109*0Sstevel@tonic-gate * 2110*0Sstevel@tonic-gate * Validate host in diskset. 2111*0Sstevel@tonic-gate * - Should be in membership list from API 2112*0Sstevel@tonic-gate * - Should not already be joined into diskset. 
2113*0Sstevel@tonic-gate * - Set must have drives 2114*0Sstevel@tonic-gate * Assume valid configuration is stored in the set/drive/node records 2115*0Sstevel@tonic-gate * in the local mddb since no node or drive can be added to the MNset 2116*0Sstevel@tonic-gate * unless all drives and nodes are available. Reconfig steps will 2117*0Sstevel@tonic-gate * resync all ALIVE nodes in case of panic in critical areas. 2118*0Sstevel@tonic-gate * 2119*0Sstevel@tonic-gate * Lock down the set. 2120*0Sstevel@tonic-gate * Verify host is a member of this diskset. 2121*0Sstevel@tonic-gate * If drives exist in the configuration, load the mddbs. 2122*0Sstevel@tonic-gate * Set this node to active by notifying master if one exists. 2123*0Sstevel@tonic-gate * If this is the first node active in the diskset, this node 2124*0Sstevel@tonic-gate * becomes the master. 2125*0Sstevel@tonic-gate * Unlock the set. 2126*0Sstevel@tonic-gate * 2127*0Sstevel@tonic-gate * Mirror Resync: 2128*0Sstevel@tonic-gate * If this node is the last node to join the set and clustering 2129*0Sstevel@tonic-gate * isn't running, then start the 'metasync -r' type resync 2130*0Sstevel@tonic-gate * on all mirrors in this diskset. 2131*0Sstevel@tonic-gate * If clustering is running, this resync operation will 2132*0Sstevel@tonic-gate * be handled by the reconfig steps and should NOT 2133*0Sstevel@tonic-gate * be handled during a join operation. 2134*0Sstevel@tonic-gate * 2135*0Sstevel@tonic-gate * There are multiple return values in order to assist 2136*0Sstevel@tonic-gate * the join operation of all sets in the metaset command. 2137*0Sstevel@tonic-gate * 2138*0Sstevel@tonic-gate * Return values: 2139*0Sstevel@tonic-gate * 0 - Node successfully joined to set. 
2140*0Sstevel@tonic-gate * -1 - Join attempted but failed 2141*0Sstevel@tonic-gate * - any failure from libmeta calls 2142*0Sstevel@tonic-gate * - node not in the member list 2143*0Sstevel@tonic-gate * -2 - Join not attempted since 2144*0Sstevel@tonic-gate * - this set had no drives in set 2145*0Sstevel@tonic-gate * - this node already joined to set 2146*0Sstevel@tonic-gate * - set is not a multinode set 2147*0Sstevel@tonic-gate * -3 - Node joined to STALE set. 2148*0Sstevel@tonic-gate */ 2149*0Sstevel@tonic-gate extern int 2150*0Sstevel@tonic-gate meta_set_join( 2151*0Sstevel@tonic-gate mdsetname_t *sp, 2152*0Sstevel@tonic-gate md_error_t *ep 2153*0Sstevel@tonic-gate ) 2154*0Sstevel@tonic-gate { 2155*0Sstevel@tonic-gate md_set_desc *sd; 2156*0Sstevel@tonic-gate md_drive_desc *dd; 2157*0Sstevel@tonic-gate md_mnnode_desc *nd, *nd2, my_nd; 2158*0Sstevel@tonic-gate int rval = 0; 2159*0Sstevel@tonic-gate md_setkey_t *cl_sk; 2160*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 2161*0Sstevel@tonic-gate md_error_t ep_snarf = mdnullerror; 2162*0Sstevel@tonic-gate int master_flag = 0; 2163*0Sstevel@tonic-gate md_mnset_record *mas_mnsr = NULL; 2164*0Sstevel@tonic-gate int clear_nr_flags = 0; 2165*0Sstevel@tonic-gate md_mnnode_record *nr; 2166*0Sstevel@tonic-gate int stale_set = 0; 2167*0Sstevel@tonic-gate int rb_flags = 0; 2168*0Sstevel@tonic-gate int stale_bool = FALSE; 2169*0Sstevel@tonic-gate int suspendall_flag = 0; 2170*0Sstevel@tonic-gate int suspend1_flag = 0; 2171*0Sstevel@tonic-gate sigset_t oldsigs; 2172*0Sstevel@tonic-gate int send_reinit = 0; 2173*0Sstevel@tonic-gate 2174*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2175*0Sstevel@tonic-gate return (-1); 2176*0Sstevel@tonic-gate } 2177*0Sstevel@tonic-gate 2178*0Sstevel@tonic-gate /* Must be a multinode diskset */ 2179*0Sstevel@tonic-gate if (!MD_MNSET_DESC(sd)) { 2180*0Sstevel@tonic-gate (void) mderror(ep, MDE_NOT_MN, sp->setname); 2181*0Sstevel@tonic-gate return (-2); 
2182*0Sstevel@tonic-gate } 2183*0Sstevel@tonic-gate 2184*0Sstevel@tonic-gate /* Verify that the node is ALIVE (i.e. is in the API membership list) */ 2185*0Sstevel@tonic-gate if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) { 2186*0Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, 2187*0Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename, NULL, 2188*0Sstevel@tonic-gate sp->setname); 2189*0Sstevel@tonic-gate return (-1); 2190*0Sstevel@tonic-gate } 2191*0Sstevel@tonic-gate 2192*0Sstevel@tonic-gate /* Make sure we are blocking all signals */ 2193*0Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, &xep) < 0) 2194*0Sstevel@tonic-gate mdclrerror(&xep); 2195*0Sstevel@tonic-gate 2196*0Sstevel@tonic-gate /* 2197*0Sstevel@tonic-gate * Lock the set on current set members. 2198*0Sstevel@tonic-gate * For MN diskset lock_set and SUSPEND are used to protect against 2199*0Sstevel@tonic-gate * other meta* commands running on the other nodes. 2200*0Sstevel@tonic-gate */ 2201*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2202*0Sstevel@tonic-gate while (nd) { 2203*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2204*0Sstevel@tonic-gate nd = nd->nd_next; 2205*0Sstevel@tonic-gate continue; 2206*0Sstevel@tonic-gate } 2207*0Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2208*0Sstevel@tonic-gate rval = -1; 2209*0Sstevel@tonic-gate goto out; 2210*0Sstevel@tonic-gate } 2211*0Sstevel@tonic-gate nd = nd->nd_next; 2212*0Sstevel@tonic-gate } 2213*0Sstevel@tonic-gate 2214*0Sstevel@tonic-gate /* 2215*0Sstevel@tonic-gate * Lock out other meta* commands by suspending 2216*0Sstevel@tonic-gate * class 1 messages across the diskset. 
2217*0Sstevel@tonic-gate */ 2218*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2219*0Sstevel@tonic-gate while (nd) { 2220*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2221*0Sstevel@tonic-gate nd = nd->nd_next; 2222*0Sstevel@tonic-gate continue; 2223*0Sstevel@tonic-gate } 2224*0Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2225*0Sstevel@tonic-gate sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2226*0Sstevel@tonic-gate rval = -1; 2227*0Sstevel@tonic-gate goto out; 2228*0Sstevel@tonic-gate } 2229*0Sstevel@tonic-gate suspend1_flag = 1; 2230*0Sstevel@tonic-gate nd = nd->nd_next; 2231*0Sstevel@tonic-gate } 2232*0Sstevel@tonic-gate 2233*0Sstevel@tonic-gate /* 2234*0Sstevel@tonic-gate * Verify that this host is a member (in the host list) of the set. 2235*0Sstevel@tonic-gate */ 2236*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2237*0Sstevel@tonic-gate while (nd) { 2238*0Sstevel@tonic-gate if (strcmp(mynode(), nd->nd_nodename) == 0) { 2239*0Sstevel@tonic-gate break; 2240*0Sstevel@tonic-gate } 2241*0Sstevel@tonic-gate nd = nd->nd_next; 2242*0Sstevel@tonic-gate } 2243*0Sstevel@tonic-gate if (!nd) { 2244*0Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2245*0Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename, NULL, 2246*0Sstevel@tonic-gate sp->setname); 2247*0Sstevel@tonic-gate rval = -1; 2248*0Sstevel@tonic-gate goto out; 2249*0Sstevel@tonic-gate } 2250*0Sstevel@tonic-gate 2251*0Sstevel@tonic-gate /* 2252*0Sstevel@tonic-gate * Need to return failure if host is already 'joined' 2253*0Sstevel@tonic-gate * into the set. This is done so that if later the user 2254*0Sstevel@tonic-gate * issues a command to join all sets and a failure is 2255*0Sstevel@tonic-gate * encountered - that the resulting cleanup effort 2256*0Sstevel@tonic-gate * (withdrawing from all sets that were joined 2257*0Sstevel@tonic-gate * during that command) won't withdraw from this set. 
2258*0Sstevel@tonic-gate */ 2259*0Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 2260*0Sstevel@tonic-gate rval = -2; 2261*0Sstevel@tonic-gate goto out2; 2262*0Sstevel@tonic-gate } 2263*0Sstevel@tonic-gate 2264*0Sstevel@tonic-gate /* 2265*0Sstevel@tonic-gate * Call metaget_setownership that calls each node in diskset and 2266*0Sstevel@tonic-gate * marks in set descriptor if node is an owner of the set or not. 2267*0Sstevel@tonic-gate * metaget_setownership checks to see if a node is an owner by 2268*0Sstevel@tonic-gate * checking to see if that node's kernel has the mddb loaded. 2269*0Sstevel@tonic-gate * If a node had panic'd during a reconfig or an 2270*0Sstevel@tonic-gate * add/delete/join/withdraw operation, the other nodes' node 2271*0Sstevel@tonic-gate * records may not reflect the current state of the diskset, 2272*0Sstevel@tonic-gate * so calling metaget_setownership is the safest thing to do. 2273*0Sstevel@tonic-gate */ 2274*0Sstevel@tonic-gate if (metaget_setownership(sp, ep) == -1) { 2275*0Sstevel@tonic-gate rval = -1; 2276*0Sstevel@tonic-gate goto out; 2277*0Sstevel@tonic-gate } 2278*0Sstevel@tonic-gate 2279*0Sstevel@tonic-gate /* If first active member of diskset, become the master. */ 2280*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2281*0Sstevel@tonic-gate while (nd) { 2282*0Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) 2283*0Sstevel@tonic-gate break; 2284*0Sstevel@tonic-gate nd = nd->nd_next; 2285*0Sstevel@tonic-gate } 2286*0Sstevel@tonic-gate if (nd == NULL) 2287*0Sstevel@tonic-gate master_flag = 1; 2288*0Sstevel@tonic-gate 2289*0Sstevel@tonic-gate /* 2290*0Sstevel@tonic-gate * If not first active member of diskset, then get the 2291*0Sstevel@tonic-gate * master information from a node that is already joined 2292*0Sstevel@tonic-gate * and set the master information for this node. Be sure 2293*0Sstevel@tonic-gate * that this node (the already joined node) has its own 2294*0Sstevel@tonic-gate * join flag set. 
If not, then this diskset isn't currently 2295*0Sstevel@tonic-gate * consistent and shouldn't allow a node to join. This diskset 2296*0Sstevel@tonic-gate * inconsistency should only occur when a node has panic'd in 2297*0Sstevel@tonic-gate * the set while doing a metaset operation and the sysadmin is 2298*0Sstevel@tonic-gate * attempting to join a node into the set. This inconsistency 2299*0Sstevel@tonic-gate * will be fixed during a reconfig cycle which should be occurring 2300*0Sstevel@tonic-gate * soon since a node panic'd. 2301*0Sstevel@tonic-gate * 2302*0Sstevel@tonic-gate * If unable to get this information from an owning node, then 2303*0Sstevel@tonic-gate * this diskset isn't currently consistent and shouldn't 2304*0Sstevel@tonic-gate * allow a node to join. 2305*0Sstevel@tonic-gate */ 2306*0Sstevel@tonic-gate if (!master_flag) { 2307*0Sstevel@tonic-gate /* get master information from an owner (joined) node */ 2308*0Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 2309*0Sstevel@tonic-gate sp->setno, &mas_mnsr, ep) == -1) { 2310*0Sstevel@tonic-gate rval = -1; 2311*0Sstevel@tonic-gate goto out; 2312*0Sstevel@tonic-gate } 2313*0Sstevel@tonic-gate 2314*0Sstevel@tonic-gate /* Verify that owner (joined) node has its own JOIN flag set */ 2315*0Sstevel@tonic-gate nr = mas_mnsr->sr_nodechain; 2316*0Sstevel@tonic-gate while (nr) { 2317*0Sstevel@tonic-gate if ((nd->nd_nodeid == nr->nr_nodeid) && 2318*0Sstevel@tonic-gate ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) { 2319*0Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NODENOSET, 2320*0Sstevel@tonic-gate sp->setno, nd->nd_nodename, NULL, 2321*0Sstevel@tonic-gate nd->nd_nodename); 2322*0Sstevel@tonic-gate free_sr((md_set_record *)mas_mnsr); 2323*0Sstevel@tonic-gate rval = -1; 2324*0Sstevel@tonic-gate goto out; 2325*0Sstevel@tonic-gate } 2326*0Sstevel@tonic-gate nr = nr->nr_next; 2327*0Sstevel@tonic-gate } 2328*0Sstevel@tonic-gate 2329*0Sstevel@tonic-gate /* 2330*0Sstevel@tonic-gate * Does master have set 
marked as STALE? 2331*0Sstevel@tonic-gate * If so, need to pass this down to kernel when 2332*0Sstevel@tonic-gate * this node snarfs the set. 2333*0Sstevel@tonic-gate */ 2334*0Sstevel@tonic-gate if (clnt_mn_is_stale(nd->nd_nodename, sp, 2335*0Sstevel@tonic-gate &stale_bool, ep) == -1) { 2336*0Sstevel@tonic-gate rval = -1; 2337*0Sstevel@tonic-gate goto out; 2338*0Sstevel@tonic-gate } 2339*0Sstevel@tonic-gate 2340*0Sstevel@tonic-gate /* set master information in my rpc.metad's set record */ 2341*0Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm, 2342*0Sstevel@tonic-gate mas_mnsr->sr_master_nodeid, ep)) { 2343*0Sstevel@tonic-gate free_sr((md_set_record *)mas_mnsr); 2344*0Sstevel@tonic-gate rval = -1; 2345*0Sstevel@tonic-gate goto out; 2346*0Sstevel@tonic-gate } 2347*0Sstevel@tonic-gate 2348*0Sstevel@tonic-gate /* set master information in my cached set desc */ 2349*0Sstevel@tonic-gate (void) strcpy(sd->sd_mn_master_nodenm, 2350*0Sstevel@tonic-gate mas_mnsr->sr_master_nodenm); 2351*0Sstevel@tonic-gate sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid; 2352*0Sstevel@tonic-gate nd2 = sd->sd_nodelist; 2353*0Sstevel@tonic-gate while (nd2) { 2354*0Sstevel@tonic-gate if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) { 2355*0Sstevel@tonic-gate sd->sd_mn_masternode = nd2; 2356*0Sstevel@tonic-gate break; 2357*0Sstevel@tonic-gate } 2358*0Sstevel@tonic-gate nd2 = nd2->nd_next; 2359*0Sstevel@tonic-gate } 2360*0Sstevel@tonic-gate free_sr((md_set_record *)mas_mnsr); 2361*0Sstevel@tonic-gate 2362*0Sstevel@tonic-gate /* 2363*0Sstevel@tonic-gate * Set the node flags in mynode's rpc.metad node records for 2364*0Sstevel@tonic-gate * the nodes that are in the diskset. Can use my sd 2365*0Sstevel@tonic-gate * since earlier call to metaget_setownership set the 2366*0Sstevel@tonic-gate * owner flags based on whether that node had snarfed 2367*0Sstevel@tonic-gate * the MN diskset mddb. 
Reconfig steps guarantee that 2368*0Sstevel@tonic-gate * return of metaget_setownership will match the owning 2369*0Sstevel@tonic-gate * node's owner list except in the case where a node 2370*0Sstevel@tonic-gate * has just panic'd and in this case, a reconfig will 2371*0Sstevel@tonic-gate * be starting immediately and the owner lists will 2372*0Sstevel@tonic-gate * be sync'd up by the reconfig. 2373*0Sstevel@tonic-gate * 2374*0Sstevel@tonic-gate * Flag of SET means to take no action except to 2375*0Sstevel@tonic-gate * set the node flags as given in the nodelist linked list. 2376*0Sstevel@tonic-gate */ 2377*0Sstevel@tonic-gate if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 2378*0Sstevel@tonic-gate MD_NR_SET, NULL, ep)) { 2379*0Sstevel@tonic-gate rval = -1; 2380*0Sstevel@tonic-gate goto out; 2381*0Sstevel@tonic-gate } 2382*0Sstevel@tonic-gate } 2383*0Sstevel@tonic-gate 2384*0Sstevel@tonic-gate /* 2385*0Sstevel@tonic-gate * Read in the mddb if there are drives in the set. 2386*0Sstevel@tonic-gate */ 2387*0Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2388*0Sstevel@tonic-gate ep)) == NULL) { 2389*0Sstevel@tonic-gate /* No drives in list */ 2390*0Sstevel@tonic-gate if (! mdisok(ep)) { 2391*0Sstevel@tonic-gate rval = -1; 2392*0Sstevel@tonic-gate goto out; 2393*0Sstevel@tonic-gate } 2394*0Sstevel@tonic-gate rval = -2; 2395*0Sstevel@tonic-gate goto out; 2396*0Sstevel@tonic-gate } 2397*0Sstevel@tonic-gate 2398*0Sstevel@tonic-gate /* 2399*0Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change. 2400*0Sstevel@tonic-gate * Start by suspending rpc.mdcommd (which drains it of all messages), 2401*0Sstevel@tonic-gate * then change the nodelist followed by a reinit and resume. 
2402*0Sstevel@tonic-gate */ 2403*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2404*0Sstevel@tonic-gate while (nd) { 2405*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2406*0Sstevel@tonic-gate nd = nd->nd_next; 2407*0Sstevel@tonic-gate continue; 2408*0Sstevel@tonic-gate } 2409*0Sstevel@tonic-gate 2410*0Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, 2411*0Sstevel@tonic-gate MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 2412*0Sstevel@tonic-gate rval = -1; 2413*0Sstevel@tonic-gate goto out; 2414*0Sstevel@tonic-gate } 2415*0Sstevel@tonic-gate suspendall_flag = 1; 2416*0Sstevel@tonic-gate nd = nd->nd_next; 2417*0Sstevel@tonic-gate } 2418*0Sstevel@tonic-gate 2419*0Sstevel@tonic-gate /* Set master in my set record in rpc.metad */ 2420*0Sstevel@tonic-gate if (master_flag) { 2421*0Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp, 2422*0Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename, 2423*0Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodeid, ep)) { 2424*0Sstevel@tonic-gate rval = -1; 2425*0Sstevel@tonic-gate goto out; 2426*0Sstevel@tonic-gate } 2427*0Sstevel@tonic-gate } 2428*0Sstevel@tonic-gate /* Causes mddbs to be loaded in kernel */ 2429*0Sstevel@tonic-gate if (setup_db_bydd(sp, dd, 0, ep) == -1) { 2430*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 2431*0Sstevel@tonic-gate "Host not able to start diskset.")); 2432*0Sstevel@tonic-gate rval = -1; 2433*0Sstevel@tonic-gate goto out; 2434*0Sstevel@tonic-gate } 2435*0Sstevel@tonic-gate 2436*0Sstevel@tonic-gate if (! mdisok(ep)) { 2437*0Sstevel@tonic-gate rval = -1; 2438*0Sstevel@tonic-gate goto out; 2439*0Sstevel@tonic-gate } 2440*0Sstevel@tonic-gate 2441*0Sstevel@tonic-gate /* 2442*0Sstevel@tonic-gate * Set rollback flags to 1 so that halt_set is called if a failure 2443*0Sstevel@tonic-gate * is seen after this point. If snarf_set fails, still need to 2444*0Sstevel@tonic-gate * call halt_set to cleanup the diskset. 
2445*0Sstevel@tonic-gate */ 2446*0Sstevel@tonic-gate rb_flags = 1; 2447*0Sstevel@tonic-gate 2448*0Sstevel@tonic-gate /* Starts the set */ 2449*0Sstevel@tonic-gate if (snarf_set(sp, stale_bool, ep) != 0) { 2450*0Sstevel@tonic-gate if (mdismddberror(ep, MDE_DB_STALE)) { 2451*0Sstevel@tonic-gate /* 2452*0Sstevel@tonic-gate * Don't fail join, STALE means that set has 2453*0Sstevel@tonic-gate * < 50% mddbs. 2454*0Sstevel@tonic-gate */ 2455*0Sstevel@tonic-gate (void) mdstealerror(&ep_snarf, ep); 2456*0Sstevel@tonic-gate stale_set = 1; 2457*0Sstevel@tonic-gate } else if (mdisok(ep)) { 2458*0Sstevel@tonic-gate /* If snarf failed, but no error was set - set it */ 2459*0Sstevel@tonic-gate (void) mdmddberror(ep, MDE_DB_NOTNOW, NODEV64, 2460*0Sstevel@tonic-gate sp->setno, 0, NULL); 2461*0Sstevel@tonic-gate rval = -1; 2462*0Sstevel@tonic-gate goto out; 2463*0Sstevel@tonic-gate } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) { 2464*0Sstevel@tonic-gate /* 2465*0Sstevel@tonic-gate * Don't fail join if ACCOK; ACCOK means that mediator 2466*0Sstevel@tonic-gate * provided extra vote. 2467*0Sstevel@tonic-gate */ 2468*0Sstevel@tonic-gate rval = -1; 2469*0Sstevel@tonic-gate goto out; 2470*0Sstevel@tonic-gate } 2471*0Sstevel@tonic-gate } 2472*0Sstevel@tonic-gate 2473*0Sstevel@tonic-gate /* Did set really get snarfed? 
*/ 2474*0Sstevel@tonic-gate if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) { 2475*0Sstevel@tonic-gate if (mdisok(ep)) { 2476*0Sstevel@tonic-gate /* If snarf failed, but no error was set - set it */ 2477*0Sstevel@tonic-gate (void) mdmddberror(ep, MDE_DB_NOTNOW, NODEV64, 2478*0Sstevel@tonic-gate sp->setno, 0, NULL); 2479*0Sstevel@tonic-gate } 2480*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 2481*0Sstevel@tonic-gate "Host not able to start diskset.")); 2482*0Sstevel@tonic-gate rval = -1; 2483*0Sstevel@tonic-gate goto out; 2484*0Sstevel@tonic-gate } 2485*0Sstevel@tonic-gate 2486*0Sstevel@tonic-gate /* Change to nodelist so need to send reinit to rpc.mdcommd */ 2487*0Sstevel@tonic-gate send_reinit = 1; 2488*0Sstevel@tonic-gate 2489*0Sstevel@tonic-gate /* If first node to enter set, setup master and clear change log */ 2490*0Sstevel@tonic-gate if (master_flag) { 2491*0Sstevel@tonic-gate /* Set master in my locally cached set descriptor */ 2492*0Sstevel@tonic-gate (void) strcpy(sd->sd_mn_master_nodenm, 2493*0Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename); 2494*0Sstevel@tonic-gate sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid; 2495*0Sstevel@tonic-gate sd->sd_mn_am_i_master = 1; 2496*0Sstevel@tonic-gate 2497*0Sstevel@tonic-gate /* 2498*0Sstevel@tonic-gate * If first node to join set, then clear out change log 2499*0Sstevel@tonic-gate * entries. Change log entries are only needed when a 2500*0Sstevel@tonic-gate * change of master is occurring in a diskset that has 2501*0Sstevel@tonic-gate * multiple owners. Since this node is the first owner 2502*0Sstevel@tonic-gate * of the diskset, clear the entries. 2503*0Sstevel@tonic-gate * 2504*0Sstevel@tonic-gate * Only do this if we are in a single node non-SC3.x 2505*0Sstevel@tonic-gate * situation. 
2506*0Sstevel@tonic-gate */ 2507*0Sstevel@tonic-gate if (meta_mn_singlenode() && 2508*0Sstevel@tonic-gate mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) { 2509*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 2510*0Sstevel@tonic-gate "Unable to reset changelog.")); 2511*0Sstevel@tonic-gate rval = -1; 2512*0Sstevel@tonic-gate goto out; 2513*0Sstevel@tonic-gate } 2514*0Sstevel@tonic-gate } 2515*0Sstevel@tonic-gate 2516*0Sstevel@tonic-gate /* Set my locally cached flag */ 2517*0Sstevel@tonic-gate sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 2518*0Sstevel@tonic-gate 2519*0Sstevel@tonic-gate /* 2520*0Sstevel@tonic-gate * Set this node's own flag on all joined nodes in the set 2521*0Sstevel@tonic-gate * (including my node). 2522*0Sstevel@tonic-gate */ 2523*0Sstevel@tonic-gate clear_nr_flags = 1; 2524*0Sstevel@tonic-gate 2525*0Sstevel@tonic-gate my_nd = *(sd->sd_mn_mynode); 2526*0Sstevel@tonic-gate my_nd.nd_next = NULL; 2527*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2528*0Sstevel@tonic-gate while (nd) { 2529*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2530*0Sstevel@tonic-gate nd = nd->nd_next; 2531*0Sstevel@tonic-gate continue; 2532*0Sstevel@tonic-gate } 2533*0Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 2534*0Sstevel@tonic-gate MD_NR_JOIN, NULL, ep)) { 2535*0Sstevel@tonic-gate rval = -1; 2536*0Sstevel@tonic-gate goto out; 2537*0Sstevel@tonic-gate } 2538*0Sstevel@tonic-gate nd = nd->nd_next; 2539*0Sstevel@tonic-gate } 2540*0Sstevel@tonic-gate 2541*0Sstevel@tonic-gate out: 2542*0Sstevel@tonic-gate if (rval != NULL) { 2543*0Sstevel@tonic-gate /* 2544*0Sstevel@tonic-gate * If rollback flag is 1, then node was joined to set. 2545*0Sstevel@tonic-gate * Since an error occurred, withdraw node from set in 2546*0Sstevel@tonic-gate * order to rollback to before command was run. 2547*0Sstevel@tonic-gate * Need to preserve ep so that calling function can 2548*0Sstevel@tonic-gate * get error information. 
2549*0Sstevel@tonic-gate */ 2550*0Sstevel@tonic-gate if (rb_flags == 1) { 2551*0Sstevel@tonic-gate if (halt_set(sp, &xep)) { 2552*0Sstevel@tonic-gate mdclrerror(&xep); 2553*0Sstevel@tonic-gate } 2554*0Sstevel@tonic-gate } 2555*0Sstevel@tonic-gate 2556*0Sstevel@tonic-gate /* 2557*0Sstevel@tonic-gate * If error, reset master to INVALID. 2558*0Sstevel@tonic-gate * Ignore error since (next) first node to successfully join 2559*0Sstevel@tonic-gate * will set master on all nodes. 2560*0Sstevel@tonic-gate */ 2561*0Sstevel@tonic-gate (void) clnt_mnsetmaster(mynode(), sp, "", 2562*0Sstevel@tonic-gate MD_MN_INVALID_NID, &xep); 2563*0Sstevel@tonic-gate mdclrerror(&xep); 2564*0Sstevel@tonic-gate /* Reset master in my locally cached set descriptor */ 2565*0Sstevel@tonic-gate sd->sd_mn_master_nodeid = MD_MN_INVALID_NID; 2566*0Sstevel@tonic-gate sd->sd_mn_am_i_master = 0; 2567*0Sstevel@tonic-gate 2568*0Sstevel@tonic-gate /* 2569*0Sstevel@tonic-gate * If nr flags set on other nodes, reset them. 2570*0Sstevel@tonic-gate */ 2571*0Sstevel@tonic-gate if (clear_nr_flags) { 2572*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2573*0Sstevel@tonic-gate while (nd) { 2574*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2575*0Sstevel@tonic-gate nd = nd->nd_next; 2576*0Sstevel@tonic-gate continue; 2577*0Sstevel@tonic-gate } 2578*0Sstevel@tonic-gate (void) clnt_upd_nr_flags(nd->nd_nodename, sp, 2579*0Sstevel@tonic-gate &my_nd, MD_NR_WITHDRAW, NULL, &xep); 2580*0Sstevel@tonic-gate mdclrerror(&xep); 2581*0Sstevel@tonic-gate nd = nd->nd_next; 2582*0Sstevel@tonic-gate } 2583*0Sstevel@tonic-gate /* Reset my locally cached flag */ 2584*0Sstevel@tonic-gate sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN; 2585*0Sstevel@tonic-gate } 2586*0Sstevel@tonic-gate } 2587*0Sstevel@tonic-gate 2588*0Sstevel@tonic-gate /* 2589*0Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change. 
2590*0Sstevel@tonic-gate * Send reinit command to mdcommd which forces it to get 2591*0Sstevel@tonic-gate * fresh set description. 2592*0Sstevel@tonic-gate */ 2593*0Sstevel@tonic-gate if (send_reinit) { 2594*0Sstevel@tonic-gate /* Send reinit */ 2595*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2596*0Sstevel@tonic-gate while (nd) { 2597*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2598*0Sstevel@tonic-gate nd = nd->nd_next; 2599*0Sstevel@tonic-gate continue; 2600*0Sstevel@tonic-gate } 2601*0Sstevel@tonic-gate 2602*0Sstevel@tonic-gate /* Class is ignored for REINIT */ 2603*0Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 2604*0Sstevel@tonic-gate sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 2605*0Sstevel@tonic-gate /* 2606*0Sstevel@tonic-gate * We are here because we failed to resume 2607*0Sstevel@tonic-gate * rpc.mdcommd. However we potentially have 2608*0Sstevel@tonic-gate * an error from the previous call 2609*0Sstevel@tonic-gate * If the previous call did fail, we capture 2610*0Sstevel@tonic-gate * that error and generate a perror with 2611*0Sstevel@tonic-gate * the string, "Unable to resume...". 2612*0Sstevel@tonic-gate * Setting rval to -1 ensures that in the 2613*0Sstevel@tonic-gate * next iteration of the loop, ep is not 2614*0Sstevel@tonic-gate * clobbered. 2615*0Sstevel@tonic-gate */ 2616*0Sstevel@tonic-gate if (rval == 0) 2617*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 2618*0Sstevel@tonic-gate else 2619*0Sstevel@tonic-gate mdclrerror(&xep); 2620*0Sstevel@tonic-gate rval = -1; 2621*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 2622*0Sstevel@tonic-gate "Unable to reinit rpc.mdcommd.")); 2623*0Sstevel@tonic-gate } 2624*0Sstevel@tonic-gate nd = nd->nd_next; 2625*0Sstevel@tonic-gate } 2626*0Sstevel@tonic-gate 2627*0Sstevel@tonic-gate } 2628*0Sstevel@tonic-gate 2629*0Sstevel@tonic-gate out2: 2630*0Sstevel@tonic-gate /* 2631*0Sstevel@tonic-gate * Unlock diskset by resuming messages across the diskset. 
2632*0Sstevel@tonic-gate * Just resume all classes so that resume is the same whether 2633*0Sstevel@tonic-gate * just one class was locked or all classes were locked. 2634*0Sstevel@tonic-gate */ 2635*0Sstevel@tonic-gate if ((suspend1_flag) || (suspendall_flag)) { 2636*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2637*0Sstevel@tonic-gate while (nd) { 2638*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2639*0Sstevel@tonic-gate nd = nd->nd_next; 2640*0Sstevel@tonic-gate continue; 2641*0Sstevel@tonic-gate } 2642*0Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 2643*0Sstevel@tonic-gate sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 2644*0Sstevel@tonic-gate /* 2645*0Sstevel@tonic-gate * We are here because we failed to resume 2646*0Sstevel@tonic-gate * rpc.mdcommd. However we potentially have 2647*0Sstevel@tonic-gate * an error from the previous call 2648*0Sstevel@tonic-gate * If the previous call did fail, we capture 2649*0Sstevel@tonic-gate * that error and generate a perror with 2650*0Sstevel@tonic-gate * the string, "Unable to resume...". 2651*0Sstevel@tonic-gate * Setting rval to -1 ensures that in the 2652*0Sstevel@tonic-gate * next iteration of the loop, ep is not 2653*0Sstevel@tonic-gate * clobbered. 2654*0Sstevel@tonic-gate */ 2655*0Sstevel@tonic-gate if (rval == 0) 2656*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 2657*0Sstevel@tonic-gate else 2658*0Sstevel@tonic-gate mdclrerror(&xep); 2659*0Sstevel@tonic-gate rval = -1; 2660*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 2661*0Sstevel@tonic-gate "Unable to resume rpc.mdcommd.")); 2662*0Sstevel@tonic-gate } 2663*0Sstevel@tonic-gate nd = nd->nd_next; 2664*0Sstevel@tonic-gate } 2665*0Sstevel@tonic-gate meta_ping_mnset(sp->setno); 2666*0Sstevel@tonic-gate } 2667*0Sstevel@tonic-gate 2668*0Sstevel@tonic-gate /* 2669*0Sstevel@tonic-gate * Unlock set. This flushes the caches on the servers. 
2670*0Sstevel@tonic-gate */ 2671*0Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 2672*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2673*0Sstevel@tonic-gate while (nd) { 2674*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2675*0Sstevel@tonic-gate nd = nd->nd_next; 2676*0Sstevel@tonic-gate continue; 2677*0Sstevel@tonic-gate } 2678*0Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 2679*0Sstevel@tonic-gate if (rval == 0) 2680*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 2681*0Sstevel@tonic-gate else 2682*0Sstevel@tonic-gate mdclrerror(&xep); 2683*0Sstevel@tonic-gate rval = -1; 2684*0Sstevel@tonic-gate } 2685*0Sstevel@tonic-gate nd = nd->nd_next; 2686*0Sstevel@tonic-gate } 2687*0Sstevel@tonic-gate 2688*0Sstevel@tonic-gate /* 2689*0Sstevel@tonic-gate * If this node is the last to join the diskset and clustering isn't 2690*0Sstevel@tonic-gate * running, then resync the mirrors in the diskset. We have to wait 2691*0Sstevel@tonic-gate * until all nodes are joined so that the status gets propagated to 2692*0Sstevel@tonic-gate * all of the members of the set. 2693*0Sstevel@tonic-gate * Ignore any error from the resync as the join function shouldn't fail 2694*0Sstevel@tonic-gate * because the mirror resync had a problem. 2695*0Sstevel@tonic-gate * 2696*0Sstevel@tonic-gate * Don't start resync if set is stale. 2697*0Sstevel@tonic-gate */ 2698*0Sstevel@tonic-gate if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) && 2699*0Sstevel@tonic-gate (stale_set != 1)) { 2700*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2701*0Sstevel@tonic-gate while (nd) { 2702*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) 2703*0Sstevel@tonic-gate break; 2704*0Sstevel@tonic-gate nd = nd->nd_next; 2705*0Sstevel@tonic-gate } 2706*0Sstevel@tonic-gate /* 2707*0Sstevel@tonic-gate * nd set to NULL means that we have no nodes in the set that 2708*0Sstevel@tonic-gate * haven't joined. In this case we start the resync. 
2709*0Sstevel@tonic-gate */ 2710*0Sstevel@tonic-gate if (nd == NULL) { 2711*0Sstevel@tonic-gate (void) meta_mirror_resync_all(sp, 0, &xep); 2712*0Sstevel@tonic-gate mdclrerror(&xep); 2713*0Sstevel@tonic-gate } 2714*0Sstevel@tonic-gate } 2715*0Sstevel@tonic-gate 2716*0Sstevel@tonic-gate /* Update ABR state for all soft partitions */ 2717*0Sstevel@tonic-gate (void) meta_sp_update_abr(sp, &xep); 2718*0Sstevel@tonic-gate mdclrerror(&xep); 2719*0Sstevel@tonic-gate 2720*0Sstevel@tonic-gate /* 2721*0Sstevel@tonic-gate * call metaflushsetnames to reset local cache for master and 2722*0Sstevel@tonic-gate * node information. 2723*0Sstevel@tonic-gate */ 2724*0Sstevel@tonic-gate metaflushsetname(sp); 2725*0Sstevel@tonic-gate 2726*0Sstevel@tonic-gate /* release signals back to what they were on entry */ 2727*0Sstevel@tonic-gate if (procsigs(FALSE, &oldsigs, &xep) < 0) 2728*0Sstevel@tonic-gate mdclrerror(&xep); 2729*0Sstevel@tonic-gate 2730*0Sstevel@tonic-gate /* 2731*0Sstevel@tonic-gate * If no error and stale_set is set, then set ep back 2732*0Sstevel@tonic-gate * to ep from snarf_set call and return -3. If another error 2733*0Sstevel@tonic-gate * occurred and rval is not 0, then that error would have 2734*0Sstevel@tonic-gate * caused the node to be withdrawn from the set and would 2735*0Sstevel@tonic-gate * have set ep to that error information. 2736*0Sstevel@tonic-gate */ 2737*0Sstevel@tonic-gate if ((rval == 0) && (stale_set)) { 2738*0Sstevel@tonic-gate (void) mdstealerror(ep, &ep_snarf); 2739*0Sstevel@tonic-gate return (-3); 2740*0Sstevel@tonic-gate } 2741*0Sstevel@tonic-gate 2742*0Sstevel@tonic-gate return (rval); 2743*0Sstevel@tonic-gate } 2744*0Sstevel@tonic-gate 2745*0Sstevel@tonic-gate /* 2746*0Sstevel@tonic-gate * Entry point to withdraw a node from MultiNode diskset. 2747*0Sstevel@tonic-gate * 2748*0Sstevel@tonic-gate * Validate host in diskset. 2749*0Sstevel@tonic-gate * - Should be joined into diskset. 
 * Assume valid configuration is stored in the set/drive/node records
 * in the local mddb since no node or drive can be added to the MNset
 * unless all drives and nodes are available.  Reconfig steps will
 * resync all ALIVE nodes in case of panic in critical areas.
 *
 * Lock down the set.
 * Verify that drives exist in configuration.
 * Verify host is a member of this diskset.
 * Verify host is an owner of the diskset (host is joined to diskset).
 * Only allow withdrawal of master node if master node is the only joined
 * node in the diskset.
 * Halt the diskset on this node.
 * Reset Master on this node.
 * Update node flags to show that this node has withdrawn.
 * Unlock the set.
 *
 * Return values:
 *	0  - Node successfully withdrew from set.
2768*0Sstevel@tonic-gate * -1 - Withdrawal attempted but failed 2769*0Sstevel@tonic-gate * - any failure from libmeta calls 2770*0Sstevel@tonic-gate * - node not in the member list 2771*0Sstevel@tonic-gate * -2 - Withdrawal not attempted since 2772*0Sstevel@tonic-gate * - this set had no drives in set 2773*0Sstevel@tonic-gate * - this node not joined to set 2774*0Sstevel@tonic-gate * - set is not a multinode set 2775*0Sstevel@tonic-gate */ 2776*0Sstevel@tonic-gate extern int 2777*0Sstevel@tonic-gate meta_set_withdraw( 2778*0Sstevel@tonic-gate mdsetname_t *sp, 2779*0Sstevel@tonic-gate md_error_t *ep 2780*0Sstevel@tonic-gate ) 2781*0Sstevel@tonic-gate { 2782*0Sstevel@tonic-gate md_set_desc *sd; 2783*0Sstevel@tonic-gate md_drive_desc *dd = 0; 2784*0Sstevel@tonic-gate md_mnnode_desc *nd, my_nd; 2785*0Sstevel@tonic-gate int rval = 0; 2786*0Sstevel@tonic-gate md_setkey_t *cl_sk; 2787*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 2788*0Sstevel@tonic-gate int set_halted = 0; 2789*0Sstevel@tonic-gate int suspendall_flag = 0; 2790*0Sstevel@tonic-gate int suspend1_flag = 0; 2791*0Sstevel@tonic-gate bool_t stale_bool = FALSE; 2792*0Sstevel@tonic-gate mddb_config_t c; 2793*0Sstevel@tonic-gate int node_id_list[1]; 2794*0Sstevel@tonic-gate sigset_t oldsigs; 2795*0Sstevel@tonic-gate int send_reinit = 0; 2796*0Sstevel@tonic-gate 2797*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2798*0Sstevel@tonic-gate return (-1); 2799*0Sstevel@tonic-gate } 2800*0Sstevel@tonic-gate 2801*0Sstevel@tonic-gate /* Must be a multinode diskset */ 2802*0Sstevel@tonic-gate if (!MD_MNSET_DESC(sd)) { 2803*0Sstevel@tonic-gate (void) mderror(ep, MDE_NOT_MN, sp->setname); 2804*0Sstevel@tonic-gate return (-1); 2805*0Sstevel@tonic-gate } 2806*0Sstevel@tonic-gate 2807*0Sstevel@tonic-gate /* Make sure we are blocking all signals */ 2808*0Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, &xep) < 0) 2809*0Sstevel@tonic-gate mdclrerror(&xep); 2810*0Sstevel@tonic-gate 2811*0Sstevel@tonic-gate /* 
2812*0Sstevel@tonic-gate * Lock the set on current set members. 2813*0Sstevel@tonic-gate * For MN diskset lock_set and SUSPEND are used to protect against 2814*0Sstevel@tonic-gate * other meta* commands running on the other nodes. 2815*0Sstevel@tonic-gate */ 2816*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2817*0Sstevel@tonic-gate while (nd) { 2818*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2819*0Sstevel@tonic-gate nd = nd->nd_next; 2820*0Sstevel@tonic-gate continue; 2821*0Sstevel@tonic-gate } 2822*0Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2823*0Sstevel@tonic-gate rval = -1; 2824*0Sstevel@tonic-gate goto out; 2825*0Sstevel@tonic-gate } 2826*0Sstevel@tonic-gate nd = nd->nd_next; 2827*0Sstevel@tonic-gate } 2828*0Sstevel@tonic-gate /* 2829*0Sstevel@tonic-gate * Lock out other meta* commands by suspending 2830*0Sstevel@tonic-gate * class 1 messages across the diskset. 2831*0Sstevel@tonic-gate */ 2832*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2833*0Sstevel@tonic-gate while (nd) { 2834*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2835*0Sstevel@tonic-gate nd = nd->nd_next; 2836*0Sstevel@tonic-gate continue; 2837*0Sstevel@tonic-gate } 2838*0Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2839*0Sstevel@tonic-gate sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2840*0Sstevel@tonic-gate rval = -1; 2841*0Sstevel@tonic-gate goto out; 2842*0Sstevel@tonic-gate } 2843*0Sstevel@tonic-gate suspend1_flag = 1; 2844*0Sstevel@tonic-gate nd = nd->nd_next; 2845*0Sstevel@tonic-gate } 2846*0Sstevel@tonic-gate 2847*0Sstevel@tonic-gate /* Get list of drives - needed in case of failure */ 2848*0Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2849*0Sstevel@tonic-gate ep)) == NULL) { 2850*0Sstevel@tonic-gate /* Error getting drives in list */ 2851*0Sstevel@tonic-gate if (! 
mdisok(ep)) { 2852*0Sstevel@tonic-gate rval = -1; 2853*0Sstevel@tonic-gate goto out2; 2854*0Sstevel@tonic-gate } 2855*0Sstevel@tonic-gate /* no drives in list */ 2856*0Sstevel@tonic-gate rval = -2; 2857*0Sstevel@tonic-gate goto out2; 2858*0Sstevel@tonic-gate } 2859*0Sstevel@tonic-gate 2860*0Sstevel@tonic-gate /* 2861*0Sstevel@tonic-gate * Verify that this host is a member (in the host list) of the set. 2862*0Sstevel@tonic-gate */ 2863*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2864*0Sstevel@tonic-gate while (nd) { 2865*0Sstevel@tonic-gate if (strcmp(mynode(), nd->nd_nodename) == 0) { 2866*0Sstevel@tonic-gate break; 2867*0Sstevel@tonic-gate } 2868*0Sstevel@tonic-gate nd = nd->nd_next; 2869*0Sstevel@tonic-gate } 2870*0Sstevel@tonic-gate if (!nd) { 2871*0Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2872*0Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename, NULL, 2873*0Sstevel@tonic-gate sp->setname); 2874*0Sstevel@tonic-gate rval = -1; 2875*0Sstevel@tonic-gate goto out2; 2876*0Sstevel@tonic-gate } 2877*0Sstevel@tonic-gate 2878*0Sstevel@tonic-gate /* 2879*0Sstevel@tonic-gate * Call metaget_setownership that calls each node in diskset and 2880*0Sstevel@tonic-gate * marks in set descriptor if node is an owner of the set or not. 2881*0Sstevel@tonic-gate * metaget_setownership checks to see if a node is an owner by 2882*0Sstevel@tonic-gate * checking to see if that node's kernel has the mddb loaded. 2883*0Sstevel@tonic-gate * If a node had panic'd during a reconfig or an 2884*0Sstevel@tonic-gate * add/delete/join/withdraw operation, the other nodes' node 2885*0Sstevel@tonic-gate * records may not reflect the current state of the diskset, 2886*0Sstevel@tonic-gate * so calling metaget_setownership is the safest thing to do. 
2887*0Sstevel@tonic-gate */ 2888*0Sstevel@tonic-gate if (metaget_setownership(sp, ep) == -1) { 2889*0Sstevel@tonic-gate rval = -1; 2890*0Sstevel@tonic-gate goto out2; 2891*0Sstevel@tonic-gate } 2892*0Sstevel@tonic-gate 2893*0Sstevel@tonic-gate /* 2894*0Sstevel@tonic-gate * Verify that this node is joined 2895*0Sstevel@tonic-gate * to diskset (i.e. is an owner of the diskset). 2896*0Sstevel@tonic-gate */ 2897*0Sstevel@tonic-gate if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 2898*0Sstevel@tonic-gate rval = -2; 2899*0Sstevel@tonic-gate goto out2; 2900*0Sstevel@tonic-gate } 2901*0Sstevel@tonic-gate 2902*0Sstevel@tonic-gate /* 2903*0Sstevel@tonic-gate * For a MN diskset, only withdraw master if it is 2904*0Sstevel@tonic-gate * the only joined node. 2905*0Sstevel@tonic-gate */ 2906*0Sstevel@tonic-gate if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) { 2907*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2908*0Sstevel@tonic-gate while (nd) { 2909*0Sstevel@tonic-gate /* Skip my node since checking for other owners */ 2910*0Sstevel@tonic-gate if (nd->nd_nodeid == sd->sd_mn_master_nodeid) { 2911*0Sstevel@tonic-gate nd = nd->nd_next; 2912*0Sstevel@tonic-gate continue; 2913*0Sstevel@tonic-gate } 2914*0Sstevel@tonic-gate /* If another owner node if found, error */ 2915*0Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 2916*0Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_WITHDRAWMASTER, 2917*0Sstevel@tonic-gate sp->setno, 2918*0Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename, NULL, 2919*0Sstevel@tonic-gate sp->setname); 2920*0Sstevel@tonic-gate rval = -1; 2921*0Sstevel@tonic-gate goto out2; 2922*0Sstevel@tonic-gate } 2923*0Sstevel@tonic-gate nd = nd->nd_next; 2924*0Sstevel@tonic-gate } 2925*0Sstevel@tonic-gate } 2926*0Sstevel@tonic-gate 2927*0Sstevel@tonic-gate /* 2928*0Sstevel@tonic-gate * Is current set STALE? 
2929*0Sstevel@tonic-gate */ 2930*0Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c)); 2931*0Sstevel@tonic-gate c.c_id = 0; 2932*0Sstevel@tonic-gate c.c_setno = sp->setno; 2933*0Sstevel@tonic-gate if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 2934*0Sstevel@tonic-gate (void) mdstealerror(ep, &c.c_mde); 2935*0Sstevel@tonic-gate rval = -1; 2936*0Sstevel@tonic-gate goto out; 2937*0Sstevel@tonic-gate } 2938*0Sstevel@tonic-gate if (c.c_flags & MDDB_C_STALE) { 2939*0Sstevel@tonic-gate stale_bool = TRUE; 2940*0Sstevel@tonic-gate } 2941*0Sstevel@tonic-gate 2942*0Sstevel@tonic-gate /* 2943*0Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change. 2944*0Sstevel@tonic-gate * Start by suspending rpc.mdcommd (which drains it of all messages), 2945*0Sstevel@tonic-gate * then change the nodelist followed by a reinit and resume. 2946*0Sstevel@tonic-gate */ 2947*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2948*0Sstevel@tonic-gate while (nd) { 2949*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2950*0Sstevel@tonic-gate nd = nd->nd_next; 2951*0Sstevel@tonic-gate continue; 2952*0Sstevel@tonic-gate } 2953*0Sstevel@tonic-gate 2954*0Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2955*0Sstevel@tonic-gate sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 2956*0Sstevel@tonic-gate rval = -1; 2957*0Sstevel@tonic-gate goto out; 2958*0Sstevel@tonic-gate } 2959*0Sstevel@tonic-gate suspendall_flag = 1; 2960*0Sstevel@tonic-gate nd = nd->nd_next; 2961*0Sstevel@tonic-gate } 2962*0Sstevel@tonic-gate 2963*0Sstevel@tonic-gate /* 2964*0Sstevel@tonic-gate * Withdraw the set - halt set. 2965*0Sstevel@tonic-gate * This will fail if any I/O is occuring to any metadevice which 2966*0Sstevel@tonic-gate * includes a resync to a mirror metadevice. 2967*0Sstevel@tonic-gate */ 2968*0Sstevel@tonic-gate set_halted = 1; 2969*0Sstevel@tonic-gate if (halt_set(sp, ep)) { 2970*0Sstevel@tonic-gate /* Was set actually halted? 
*/ 2971*0Sstevel@tonic-gate if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) { 2972*0Sstevel@tonic-gate set_halted = 0; 2973*0Sstevel@tonic-gate } 2974*0Sstevel@tonic-gate rval = -1; 2975*0Sstevel@tonic-gate goto out; 2976*0Sstevel@tonic-gate } 2977*0Sstevel@tonic-gate 2978*0Sstevel@tonic-gate /* Change to nodelist so need to send reinit to rpc.mdcommd */ 2979*0Sstevel@tonic-gate send_reinit = 1; 2980*0Sstevel@tonic-gate 2981*0Sstevel@tonic-gate /* Reset master on withdrawn node */ 2982*0Sstevel@tonic-gate if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "", 2983*0Sstevel@tonic-gate MD_MN_INVALID_NID, ep)) { 2984*0Sstevel@tonic-gate rval = -1; 2985*0Sstevel@tonic-gate goto out; 2986*0Sstevel@tonic-gate } 2987*0Sstevel@tonic-gate 2988*0Sstevel@tonic-gate /* Mark my node as withdrawn and send to other nodes */ 2989*0Sstevel@tonic-gate nd = sd->sd_nodelist; 2990*0Sstevel@tonic-gate my_nd = *(sd->sd_mn_mynode); /* structure copy */ 2991*0Sstevel@tonic-gate my_nd.nd_next = NULL; 2992*0Sstevel@tonic-gate while (nd) { 2993*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2994*0Sstevel@tonic-gate nd = nd->nd_next; 2995*0Sstevel@tonic-gate continue; 2996*0Sstevel@tonic-gate } 2997*0Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 2998*0Sstevel@tonic-gate MD_NR_WITHDRAW, NULL, ep)) { 2999*0Sstevel@tonic-gate rval = -1; 3000*0Sstevel@tonic-gate goto out; 3001*0Sstevel@tonic-gate } 3002*0Sstevel@tonic-gate nd = nd->nd_next; 3003*0Sstevel@tonic-gate } 3004*0Sstevel@tonic-gate 3005*0Sstevel@tonic-gate /* 3006*0Sstevel@tonic-gate * If withdrawn node is a mirror owner, reset mirror owner 3007*0Sstevel@tonic-gate * to NULL. If an error occurs, print a warning and continue. 3008*0Sstevel@tonic-gate * Don't fail metaset because of mirror owner reset problem since 3009*0Sstevel@tonic-gate * next node to grab mirror will resolve this issue. 
3010*0Sstevel@tonic-gate * Before next node grabs mirrors, metaset will show the withdrawn 3011*0Sstevel@tonic-gate * node as owner which is why an attempt to reset the mirror owner 3012*0Sstevel@tonic-gate * is made. 3013*0Sstevel@tonic-gate */ 3014*0Sstevel@tonic-gate node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */ 3015*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3016*0Sstevel@tonic-gate while (nd) { 3017*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3018*0Sstevel@tonic-gate nd = nd->nd_next; 3019*0Sstevel@tonic-gate continue; 3020*0Sstevel@tonic-gate } 3021*0Sstevel@tonic-gate if (clnt_reset_mirror_owner(nd->nd_nodename, sp, 3022*0Sstevel@tonic-gate 1, &node_id_list[0], &xep) == 01) { 3023*0Sstevel@tonic-gate mde_perror(&xep, dgettext(TEXT_DOMAIN, 3024*0Sstevel@tonic-gate "Unable to reset mirror owner on node %s"), 3025*0Sstevel@tonic-gate nd->nd_nodename); 3026*0Sstevel@tonic-gate mdclrerror(&xep); 3027*0Sstevel@tonic-gate } 3028*0Sstevel@tonic-gate nd = nd->nd_next; 3029*0Sstevel@tonic-gate } 3030*0Sstevel@tonic-gate 3031*0Sstevel@tonic-gate out: 3032*0Sstevel@tonic-gate if (rval == -1) { 3033*0Sstevel@tonic-gate /* Rejoin node - Mark node as joined and send to other nodes */ 3034*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3035*0Sstevel@tonic-gate my_nd = *(sd->sd_mn_mynode); /* structure copy */ 3036*0Sstevel@tonic-gate my_nd.nd_next = NULL; 3037*0Sstevel@tonic-gate while (nd) { 3038*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3039*0Sstevel@tonic-gate nd = nd->nd_next; 3040*0Sstevel@tonic-gate continue; 3041*0Sstevel@tonic-gate } 3042*0Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3043*0Sstevel@tonic-gate MD_NR_JOIN, NULL, &xep)) { 3044*0Sstevel@tonic-gate mdclrerror(&xep); 3045*0Sstevel@tonic-gate } 3046*0Sstevel@tonic-gate nd = nd->nd_next; 3047*0Sstevel@tonic-gate } 3048*0Sstevel@tonic-gate 3049*0Sstevel@tonic-gate /* Set master on withdrawn node */ 3050*0Sstevel@tonic-gate if 
(clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, 3051*0Sstevel@tonic-gate sd->sd_mn_master_nodenm, 3052*0Sstevel@tonic-gate sd->sd_mn_master_nodeid, &xep)) { 3053*0Sstevel@tonic-gate mdclrerror(&xep); 3054*0Sstevel@tonic-gate } 3055*0Sstevel@tonic-gate 3056*0Sstevel@tonic-gate /* Join set if halt_set had succeeded */ 3057*0Sstevel@tonic-gate if (set_halted) { 3058*0Sstevel@tonic-gate if (setup_db_bydd(sp, dd, 0, &xep) == -1) { 3059*0Sstevel@tonic-gate mdclrerror(&xep); 3060*0Sstevel@tonic-gate } 3061*0Sstevel@tonic-gate /* If set previously stale - make it so at re-join */ 3062*0Sstevel@tonic-gate if (snarf_set(sp, stale_bool, &xep) != 0) { 3063*0Sstevel@tonic-gate mdclrerror(&xep); 3064*0Sstevel@tonic-gate (void) halt_set(sp, &xep); 3065*0Sstevel@tonic-gate mdclrerror(&xep); 3066*0Sstevel@tonic-gate } 3067*0Sstevel@tonic-gate } 3068*0Sstevel@tonic-gate } 3069*0Sstevel@tonic-gate 3070*0Sstevel@tonic-gate /* 3071*0Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change. 3072*0Sstevel@tonic-gate * Send reinit command to mdcommd which forces it to get 3073*0Sstevel@tonic-gate * fresh set description. 3074*0Sstevel@tonic-gate */ 3075*0Sstevel@tonic-gate if (send_reinit) { 3076*0Sstevel@tonic-gate /* Send reinit */ 3077*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3078*0Sstevel@tonic-gate while (nd) { 3079*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3080*0Sstevel@tonic-gate nd = nd->nd_next; 3081*0Sstevel@tonic-gate continue; 3082*0Sstevel@tonic-gate } 3083*0Sstevel@tonic-gate 3084*0Sstevel@tonic-gate /* Class is ignored for REINIT */ 3085*0Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 3086*0Sstevel@tonic-gate sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 3087*0Sstevel@tonic-gate /* 3088*0Sstevel@tonic-gate * We are here because we failed to resume 3089*0Sstevel@tonic-gate * rpc.mdcommd. However we potentially have 3090*0Sstevel@tonic-gate * an error from the previous call. 
3091*0Sstevel@tonic-gate * If the previous call did fail, we 3092*0Sstevel@tonic-gate * capture that error and generate a perror 3093*0Sstevel@tonic-gate * withthe string, "Unable to resume...". 3094*0Sstevel@tonic-gate * Setting rval to -1 ensures that in the 3095*0Sstevel@tonic-gate * next iteration of the loop, ep is not 3096*0Sstevel@tonic-gate * clobbered. 3097*0Sstevel@tonic-gate */ 3098*0Sstevel@tonic-gate if (rval == 0) 3099*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 3100*0Sstevel@tonic-gate else 3101*0Sstevel@tonic-gate mdclrerror(&xep); 3102*0Sstevel@tonic-gate rval = -1; 3103*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 3104*0Sstevel@tonic-gate "Unable to reinit rpc.mdcommd.")); 3105*0Sstevel@tonic-gate } 3106*0Sstevel@tonic-gate nd = nd->nd_next; 3107*0Sstevel@tonic-gate } 3108*0Sstevel@tonic-gate } 3109*0Sstevel@tonic-gate 3110*0Sstevel@tonic-gate out2: 3111*0Sstevel@tonic-gate /* 3112*0Sstevel@tonic-gate * Unlock diskset by resuming messages across the diskset. 3113*0Sstevel@tonic-gate * Just resume all classes so that resume is the same whether 3114*0Sstevel@tonic-gate * just one class was locked or all classes were locked. 3115*0Sstevel@tonic-gate */ 3116*0Sstevel@tonic-gate if ((suspend1_flag) || (suspendall_flag)) { 3117*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3118*0Sstevel@tonic-gate while (nd) { 3119*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3120*0Sstevel@tonic-gate nd = nd->nd_next; 3121*0Sstevel@tonic-gate continue; 3122*0Sstevel@tonic-gate } 3123*0Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 3124*0Sstevel@tonic-gate sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 3125*0Sstevel@tonic-gate /* 3126*0Sstevel@tonic-gate * We are here because we failed to resume 3127*0Sstevel@tonic-gate * rpc.mdcommd. 
However we potentially have 3128*0Sstevel@tonic-gate * an error from the previous call 3129*0Sstevel@tonic-gate * If the previous call did fail, we capture 3130*0Sstevel@tonic-gate * that error and generate a perror with 3131*0Sstevel@tonic-gate * the string, "Unable to resume...". 3132*0Sstevel@tonic-gate * Setting rval to -1 ensures that in the 3133*0Sstevel@tonic-gate * next iteration of the loop, ep is not 3134*0Sstevel@tonic-gate * clobbered. 3135*0Sstevel@tonic-gate */ 3136*0Sstevel@tonic-gate if (rval == 0) 3137*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 3138*0Sstevel@tonic-gate else 3139*0Sstevel@tonic-gate mdclrerror(&xep); 3140*0Sstevel@tonic-gate rval = -1; 3141*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 3142*0Sstevel@tonic-gate "Unable to resume rpc.mdcommd.")); 3143*0Sstevel@tonic-gate } 3144*0Sstevel@tonic-gate nd = nd->nd_next; 3145*0Sstevel@tonic-gate } 3146*0Sstevel@tonic-gate meta_ping_mnset(sp->setno); 3147*0Sstevel@tonic-gate } 3148*0Sstevel@tonic-gate 3149*0Sstevel@tonic-gate /* 3150*0Sstevel@tonic-gate * Unlock set. This flushes the caches on the servers. 
3151*0Sstevel@tonic-gate */ 3152*0Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 3153*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3154*0Sstevel@tonic-gate while (nd) { 3155*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3156*0Sstevel@tonic-gate nd = nd->nd_next; 3157*0Sstevel@tonic-gate continue; 3158*0Sstevel@tonic-gate } 3159*0Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 3160*0Sstevel@tonic-gate if (rval == 0) 3161*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 3162*0Sstevel@tonic-gate else 3163*0Sstevel@tonic-gate mdclrerror(&xep); 3164*0Sstevel@tonic-gate rval = -1; 3165*0Sstevel@tonic-gate } 3166*0Sstevel@tonic-gate nd = nd->nd_next; 3167*0Sstevel@tonic-gate } 3168*0Sstevel@tonic-gate 3169*0Sstevel@tonic-gate /* 3170*0Sstevel@tonic-gate * call metaflushsetnames to reset local cache for master and 3171*0Sstevel@tonic-gate * node information. 3172*0Sstevel@tonic-gate */ 3173*0Sstevel@tonic-gate metaflushsetname(sp); 3174*0Sstevel@tonic-gate 3175*0Sstevel@tonic-gate /* release signals back to what they were on entry */ 3176*0Sstevel@tonic-gate if (procsigs(FALSE, &oldsigs, &xep) < 0) 3177*0Sstevel@tonic-gate mdclrerror(&xep); 3178*0Sstevel@tonic-gate 3179*0Sstevel@tonic-gate return (rval); 3180*0Sstevel@tonic-gate 3181*0Sstevel@tonic-gate } 3182*0Sstevel@tonic-gate 3183*0Sstevel@tonic-gate /* 3184*0Sstevel@tonic-gate * Update nodelist with cluster member information. 3185*0Sstevel@tonic-gate * A node not in the member list will be marked 3186*0Sstevel@tonic-gate * as not ALIVE and not OWN. 3187*0Sstevel@tonic-gate * A node in the member list will be marked ALIVE, but 3188*0Sstevel@tonic-gate * the OWN bit will not be changed. 3189*0Sstevel@tonic-gate * 3190*0Sstevel@tonic-gate * If mynode isn't in the membership list, fail causing 3191*0Sstevel@tonic-gate * another reconfig cycle to be started since a non-member 3192*0Sstevel@tonic-gate * node shouldn't be taking part in the reconfig cycle. 
3193*0Sstevel@tonic-gate * 3194*0Sstevel@tonic-gate * Return values: 3195*0Sstevel@tonic-gate * 0 - No problem. 3196*0Sstevel@tonic-gate * 1 - Any failure including RPC failure to my node. 3197*0Sstevel@tonic-gate */ 3198*0Sstevel@tonic-gate int 3199*0Sstevel@tonic-gate meta_reconfig_update_nodelist( 3200*0Sstevel@tonic-gate mdsetname_t *sp, 3201*0Sstevel@tonic-gate mndiskset_membershiplist_t *nl, 3202*0Sstevel@tonic-gate md_set_desc *sd, 3203*0Sstevel@tonic-gate md_error_t *ep 3204*0Sstevel@tonic-gate ) 3205*0Sstevel@tonic-gate { 3206*0Sstevel@tonic-gate mndiskset_membershiplist_t *nl2; 3207*0Sstevel@tonic-gate md_mnnode_desc *nd; 3208*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 3209*0Sstevel@tonic-gate int rval = 0; 3210*0Sstevel@tonic-gate 3211*0Sstevel@tonic-gate /* 3212*0Sstevel@tonic-gate * Walk through nodelist, checking to see if each 3213*0Sstevel@tonic-gate * node is in the member list. 3214*0Sstevel@tonic-gate * If node is not a member, reset ALIVE and OWN node flag. 3215*0Sstevel@tonic-gate * If node is a member, set ALIVE. 3216*0Sstevel@tonic-gate * If mynode's OWN flag gets reset, then halt the diskset on this node. 
3217*0Sstevel@tonic-gate */ 3218*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3219*0Sstevel@tonic-gate while (nd) { 3220*0Sstevel@tonic-gate nl2 = nl; 3221*0Sstevel@tonic-gate while (nl2) { 3222*0Sstevel@tonic-gate /* If node is in member list, set ALIVE */ 3223*0Sstevel@tonic-gate if (nl2->msl_node_id == nd->nd_nodeid) { 3224*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_ALIVE; 3225*0Sstevel@tonic-gate break; 3226*0Sstevel@tonic-gate } else { 3227*0Sstevel@tonic-gate nl2 = nl2->next; 3228*0Sstevel@tonic-gate } 3229*0Sstevel@tonic-gate /* node is not in member list, mark !ALIVE and !OWN */ 3230*0Sstevel@tonic-gate if (nl2 == NULL) { 3231*0Sstevel@tonic-gate /* If node is mynode, then halt set if needed */ 3232*0Sstevel@tonic-gate if (strcmp(mynode(), nd->nd_nodename) == 0) { 3233*0Sstevel@tonic-gate /* 3234*0Sstevel@tonic-gate * This shouldn't happen, but just 3235*0Sstevel@tonic-gate * in case... Any node not in the 3236*0Sstevel@tonic-gate * membership list should be dead and 3237*0Sstevel@tonic-gate * not running reconfig step1. 3238*0Sstevel@tonic-gate */ 3239*0Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 3240*0Sstevel@tonic-gate if (halt_set(sp, &xep)) { 3241*0Sstevel@tonic-gate mde_perror(&xep, ""); 3242*0Sstevel@tonic-gate mdclrerror(&xep); 3243*0Sstevel@tonic-gate } 3244*0Sstevel@tonic-gate } 3245*0Sstevel@tonic-gate /* 3246*0Sstevel@tonic-gate * Return failure since this node 3247*0Sstevel@tonic-gate * (mynode) is not in the membership 3248*0Sstevel@tonic-gate * list, but process the rest of the 3249*0Sstevel@tonic-gate * nodelist first so that rpc.metad 3250*0Sstevel@tonic-gate * can be updated with the latest 3251*0Sstevel@tonic-gate * membership information. 
3252*0Sstevel@tonic-gate */ 3253*0Sstevel@tonic-gate (void) mddserror(ep, 3254*0Sstevel@tonic-gate MDE_DS_NOTINMEMBERLIST, 3255*0Sstevel@tonic-gate sp->setno, nd->nd_nodename, NULL, 3256*0Sstevel@tonic-gate sp->setname); 3257*0Sstevel@tonic-gate rval = 1; 3258*0Sstevel@tonic-gate } 3259*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ALIVE; 3260*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 3261*0Sstevel@tonic-gate } 3262*0Sstevel@tonic-gate } 3263*0Sstevel@tonic-gate nd = nd->nd_next; 3264*0Sstevel@tonic-gate } 3265*0Sstevel@tonic-gate 3266*0Sstevel@tonic-gate /* Send this information to rpc.metad */ 3267*0Sstevel@tonic-gate if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 3268*0Sstevel@tonic-gate MD_NR_SET, MNSET_IN_RECONFIG, &xep)) { 3269*0Sstevel@tonic-gate /* Return failure if can't send node flags to rpc.metad */ 3270*0Sstevel@tonic-gate if (rval == 0) { 3271*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 3272*0Sstevel@tonic-gate rval = 1; 3273*0Sstevel@tonic-gate } 3274*0Sstevel@tonic-gate } 3275*0Sstevel@tonic-gate return (rval); 3276*0Sstevel@tonic-gate } 3277*0Sstevel@tonic-gate 3278*0Sstevel@tonic-gate /* 3279*0Sstevel@tonic-gate * Choose master determines the master for a diskset. 3280*0Sstevel@tonic-gate * Each node determines the master on its own and 3281*0Sstevel@tonic-gate * adds this information to its local rpc.metad nodelist 3282*0Sstevel@tonic-gate * and also sends it to the kernel. 3283*0Sstevel@tonic-gate * 3284*0Sstevel@tonic-gate * Nodelist in set descriptor (sd) is sorted in 3285*0Sstevel@tonic-gate * monotonically increasing sequence of nodeid. 3286*0Sstevel@tonic-gate * 3287*0Sstevel@tonic-gate * Return values: 3288*0Sstevel@tonic-gate * 0 - No problem. 3289*0Sstevel@tonic-gate * 205 - There was an RPC problem to another node. 3290*0Sstevel@tonic-gate * -1 - There was an error. This could be an RPC error to my node. 3291*0Sstevel@tonic-gate * This is a catastrophic failure causing node to panic. 
3292*0Sstevel@tonic-gate */ 3293*0Sstevel@tonic-gate int 3294*0Sstevel@tonic-gate meta_reconfig_choose_master_for_set( 3295*0Sstevel@tonic-gate mdsetname_t *sp, 3296*0Sstevel@tonic-gate md_set_desc *sd, 3297*0Sstevel@tonic-gate md_error_t *ep 3298*0Sstevel@tonic-gate ) 3299*0Sstevel@tonic-gate { 3300*0Sstevel@tonic-gate int is_owner; 3301*0Sstevel@tonic-gate md_mnset_record *mnsr = NULL; 3302*0Sstevel@tonic-gate int lowest_alive_nodeid = 0; 3303*0Sstevel@tonic-gate uint_t master_nodeid; 3304*0Sstevel@tonic-gate md_mnnode_desc *nd, *nd2; 3305*0Sstevel@tonic-gate md_mnnode_record *nr; 3306*0Sstevel@tonic-gate md_drive_desc *dd; 3307*0Sstevel@tonic-gate md_setkey_t *cl_sk; 3308*0Sstevel@tonic-gate int rval = 0; 3309*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 3310*0Sstevel@tonic-gate mddb_setflags_config_t sf; 3311*0Sstevel@tonic-gate 3312*0Sstevel@tonic-gate /* 3313*0Sstevel@tonic-gate * Is current node joined to diskset? 3314*0Sstevel@tonic-gate * Don't trust flags, really check to see if mddb is snarfed. 3315*0Sstevel@tonic-gate */ 3316*0Sstevel@tonic-gate if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 3317*0Sstevel@tonic-gate /* 3318*0Sstevel@tonic-gate * If a node is joined to the diskset, this node checks 3319*0Sstevel@tonic-gate * to see if the current master of the diskset is valid and 3320*0Sstevel@tonic-gate * is still in the membership list (ALIVE) and is 3321*0Sstevel@tonic-gate * still joined (OWN). Need to verify if master is 3322*0Sstevel@tonic-gate * really joined - don't trust the flags. (Can trust 3323*0Sstevel@tonic-gate * ALIVE since set during earlier part of reconfig cycle.) 3324*0Sstevel@tonic-gate * If the current master is valid, still in the membership 3325*0Sstevel@tonic-gate * list and joined, then master is not changed on this node. 3326*0Sstevel@tonic-gate * Just return. 3327*0Sstevel@tonic-gate * 3328*0Sstevel@tonic-gate * Verify that nodeid is valid before accessing masternode. 
3329*0Sstevel@tonic-gate */ 3330*0Sstevel@tonic-gate if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) && 3331*0Sstevel@tonic-gate (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) { 3332*0Sstevel@tonic-gate if (clnt_ownset(sd->sd_mn_master_nodenm, sp, 3333*0Sstevel@tonic-gate &is_owner, ep) == -1) { 3334*0Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 3335*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 3336*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 3337*0Sstevel@tonic-gate sd->sd_mn_master_nodeid)) { 3338*0Sstevel@tonic-gate return (205); 3339*0Sstevel@tonic-gate } else { 3340*0Sstevel@tonic-gate /* Any other failure */ 3341*0Sstevel@tonic-gate return (-1); 3342*0Sstevel@tonic-gate } 3343*0Sstevel@tonic-gate } else { 3344*0Sstevel@tonic-gate if (is_owner == TRUE) { 3345*0Sstevel@tonic-gate 3346*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext( 3347*0Sstevel@tonic-gate TEXT_DOMAIN, "Set %s previous " 3348*0Sstevel@tonic-gate "master chosen %s (%d): %s"), 3349*0Sstevel@tonic-gate sp->setname, 3350*0Sstevel@tonic-gate sd->sd_mn_master_nodenm, 3351*0Sstevel@tonic-gate sd->sd_mn_master_nodeid, 3352*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - 3353*0Sstevel@tonic-gate start_time)); 3354*0Sstevel@tonic-gate 3355*0Sstevel@tonic-gate /* Previous master is ok - done */ 3356*0Sstevel@tonic-gate return (0); 3357*0Sstevel@tonic-gate } 3358*0Sstevel@tonic-gate } 3359*0Sstevel@tonic-gate } 3360*0Sstevel@tonic-gate 3361*0Sstevel@tonic-gate /* 3362*0Sstevel@tonic-gate * If current master is no longer in the membership list or 3363*0Sstevel@tonic-gate * is no longer joined, then this node uses the following 3364*0Sstevel@tonic-gate * algorithm: 3365*0Sstevel@tonic-gate * - node calls RPC routine clnt_ownset to get latest 3366*0Sstevel@tonic-gate * information on which nodes are owners of diskset. 3367*0Sstevel@tonic-gate * clnt_ownset checks on each node to see if its kernel 3368*0Sstevel@tonic-gate * has that diskset snarfed. 
3369*0Sstevel@tonic-gate */ 3370*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3371*0Sstevel@tonic-gate while (nd) { 3372*0Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 3373*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3374*0Sstevel@tonic-gate nd = nd->nd_next; 3375*0Sstevel@tonic-gate continue; 3376*0Sstevel@tonic-gate } 3377*0Sstevel@tonic-gate 3378*0Sstevel@tonic-gate if (clnt_ownset(nd->nd_nodename, sp, 3379*0Sstevel@tonic-gate &is_owner, ep) == -1) { 3380*0Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 3381*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 3382*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 3383*0Sstevel@tonic-gate nd->nd_nodeid)) { 3384*0Sstevel@tonic-gate return (205); 3385*0Sstevel@tonic-gate } else { 3386*0Sstevel@tonic-gate /* Any other failure */ 3387*0Sstevel@tonic-gate return (-1); 3388*0Sstevel@tonic-gate } 3389*0Sstevel@tonic-gate } 3390*0Sstevel@tonic-gate 3391*0Sstevel@tonic-gate /* 3392*0Sstevel@tonic-gate * Set owner flag for each node based on whether 3393*0Sstevel@tonic-gate * that node really has a diskset mddb snarfed in 3394*0Sstevel@tonic-gate * or not. 3395*0Sstevel@tonic-gate */ 3396*0Sstevel@tonic-gate if (is_owner == TRUE) 3397*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN; 3398*0Sstevel@tonic-gate else 3399*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 3400*0Sstevel@tonic-gate 3401*0Sstevel@tonic-gate nd = nd->nd_next; 3402*0Sstevel@tonic-gate } 3403*0Sstevel@tonic-gate 3404*0Sstevel@tonic-gate /* 3405*0Sstevel@tonic-gate * - node walks through nodelist looking for nodes that are 3406*0Sstevel@tonic-gate * owners of the diskset that are in the membership list. 3407*0Sstevel@tonic-gate * - for each owner, node calls RPC routine clnt_getset to 3408*0Sstevel@tonic-gate * see if that node has its node record set to OK. 3409*0Sstevel@tonic-gate * - If so, master is chosen to be this owner node. 
3410*0Sstevel@tonic-gate */ 3411*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3412*0Sstevel@tonic-gate while (nd) { 3413*0Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 3414*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3415*0Sstevel@tonic-gate nd = nd->nd_next; 3416*0Sstevel@tonic-gate continue; 3417*0Sstevel@tonic-gate } 3418*0Sstevel@tonic-gate 3419*0Sstevel@tonic-gate /* Don't consider a node that isn't an owner */ 3420*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3421*0Sstevel@tonic-gate nd = nd->nd_next; 3422*0Sstevel@tonic-gate continue; 3423*0Sstevel@tonic-gate } 3424*0Sstevel@tonic-gate 3425*0Sstevel@tonic-gate /* Does node has its own node record set to OK? */ 3426*0Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 3427*0Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 3428*0Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 3429*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 3430*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 3431*0Sstevel@tonic-gate nd->nd_nodeid)) { 3432*0Sstevel@tonic-gate return (205); 3433*0Sstevel@tonic-gate } else { 3434*0Sstevel@tonic-gate /* Any other failure */ 3435*0Sstevel@tonic-gate return (-1); 3436*0Sstevel@tonic-gate } 3437*0Sstevel@tonic-gate } 3438*0Sstevel@tonic-gate nr = mnsr->sr_nodechain; 3439*0Sstevel@tonic-gate while (nr) { 3440*0Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) { 3441*0Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) { 3442*0Sstevel@tonic-gate /* Found a master */ 3443*0Sstevel@tonic-gate free_sr( 3444*0Sstevel@tonic-gate (md_set_record *)mnsr); 3445*0Sstevel@tonic-gate goto found_master; 3446*0Sstevel@tonic-gate } 3447*0Sstevel@tonic-gate } 3448*0Sstevel@tonic-gate nr = nr->nr_next; 3449*0Sstevel@tonic-gate } 3450*0Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 3451*0Sstevel@tonic-gate nd = nd->nd_next; 3452*0Sstevel@tonic-gate } 3453*0Sstevel@tonic-gate 3454*0Sstevel@tonic-gate /* 
3455*0Sstevel@tonic-gate * - If no owner node has its own node record on its own node 3456*0Sstevel@tonic-gate * set to OK, then this node checks all of the non-owner 3457*0Sstevel@tonic-gate * nodes that are in the membership list. 3458*0Sstevel@tonic-gate * - for each non-owner, node calls RPC routine clnt_getset to 3459*0Sstevel@tonic-gate * see if that node has its node record set to OK. 3460*0Sstevel@tonic-gate * - If set doesn't exist, don't choose node for master. 3461*0Sstevel@tonic-gate * - If so, master is chosen to be this non-owner node. 3462*0Sstevel@tonic-gate * 3463*0Sstevel@tonic-gate */ 3464*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3465*0Sstevel@tonic-gate while (nd) { 3466*0Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 3467*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3468*0Sstevel@tonic-gate nd = nd->nd_next; 3469*0Sstevel@tonic-gate continue; 3470*0Sstevel@tonic-gate } 3471*0Sstevel@tonic-gate 3472*0Sstevel@tonic-gate /* Only checking non-owner nodes this time around */ 3473*0Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 3474*0Sstevel@tonic-gate nd = nd->nd_next; 3475*0Sstevel@tonic-gate continue; 3476*0Sstevel@tonic-gate } 3477*0Sstevel@tonic-gate 3478*0Sstevel@tonic-gate /* Does node has its own node record set to OK? */ 3479*0Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 3480*0Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 3481*0Sstevel@tonic-gate /* 3482*0Sstevel@tonic-gate * If set doesn't exist on non-owner node, 3483*0Sstevel@tonic-gate * don't consider this node for master. 
3484*0Sstevel@tonic-gate */ 3485*0Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 3486*0Sstevel@tonic-gate nd = nd->nd_next; 3487*0Sstevel@tonic-gate continue; 3488*0Sstevel@tonic-gate } else if ((mdanyrpcerror(ep)) && 3489*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 3490*0Sstevel@tonic-gate nd->nd_nodeid)) { 3491*0Sstevel@tonic-gate /* RPC failure to another node */ 3492*0Sstevel@tonic-gate return (205); 3493*0Sstevel@tonic-gate } else { 3494*0Sstevel@tonic-gate /* Any other failure */ 3495*0Sstevel@tonic-gate return (-1); 3496*0Sstevel@tonic-gate } 3497*0Sstevel@tonic-gate } 3498*0Sstevel@tonic-gate nr = mnsr->sr_nodechain; 3499*0Sstevel@tonic-gate while (nr) { 3500*0Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) { 3501*0Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) { 3502*0Sstevel@tonic-gate /* Found a master */ 3503*0Sstevel@tonic-gate free_sr( 3504*0Sstevel@tonic-gate (md_set_record *)mnsr); 3505*0Sstevel@tonic-gate goto found_master; 3506*0Sstevel@tonic-gate } 3507*0Sstevel@tonic-gate } 3508*0Sstevel@tonic-gate nr = nr->nr_next; 3509*0Sstevel@tonic-gate } 3510*0Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 3511*0Sstevel@tonic-gate nd = nd->nd_next; 3512*0Sstevel@tonic-gate } 3513*0Sstevel@tonic-gate 3514*0Sstevel@tonic-gate /* 3515*0Sstevel@tonic-gate * - If no node can be found that has its own node record on 3516*0Sstevel@tonic-gate * its node to be set to OK, then all alive nodes 3517*0Sstevel@tonic-gate * were in the process of being added to or deleted 3518*0Sstevel@tonic-gate * from set. Each alive node will remove all 3519*0Sstevel@tonic-gate * information pertaining to this set from its node. 3520*0Sstevel@tonic-gate * 3521*0Sstevel@tonic-gate * If all nodes in set are ALIVE, then call sdssc end routines 3522*0Sstevel@tonic-gate * since set was truly being initially created or destroyed. 
3523*0Sstevel@tonic-gate */ 3524*0Sstevel@tonic-gate goto delete_set; 3525*0Sstevel@tonic-gate } else { 3526*0Sstevel@tonic-gate 3527*0Sstevel@tonic-gate /* 3528*0Sstevel@tonic-gate * If node is not joined to diskset, then this 3529*0Sstevel@tonic-gate * node uses the following algorithm: 3530*0Sstevel@tonic-gate * - If unjoined node doesn't have a node record for itself, 3531*0Sstevel@tonic-gate * just delete the diskset since diskset was in the 3532*0Sstevel@tonic-gate * process of being created. 3533*0Sstevel@tonic-gate * - node needs to find master of diskset before 3534*0Sstevel@tonic-gate * reconfig cycle, if a master existed. 3535*0Sstevel@tonic-gate * - node calls RPC routine clnt_ownset to get latest 3536*0Sstevel@tonic-gate * information on which nodes are owners of diskset. 3537*0Sstevel@tonic-gate * clnt_ownset checks on each node to see if its 3538*0Sstevel@tonic-gate * kernel has that diskset snarfed. 3539*0Sstevel@tonic-gate */ 3540*0Sstevel@tonic-gate 3541*0Sstevel@tonic-gate /* 3542*0Sstevel@tonic-gate * Is my node in the set description? 3543*0Sstevel@tonic-gate * If not, delete the set from this node. 3544*0Sstevel@tonic-gate * sr2setdesc sets sd_mn_mynode pointer to the node 3545*0Sstevel@tonic-gate * descriptor for this node if there was a node 3546*0Sstevel@tonic-gate * record for this node. 
3547*0Sstevel@tonic-gate * 3548*0Sstevel@tonic-gate */ 3549*0Sstevel@tonic-gate if (sd->sd_mn_mynode == NULL) { 3550*0Sstevel@tonic-gate goto delete_set; 3551*0Sstevel@tonic-gate } 3552*0Sstevel@tonic-gate 3553*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3554*0Sstevel@tonic-gate while (nd) { 3555*0Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 3556*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3557*0Sstevel@tonic-gate nd = nd->nd_next; 3558*0Sstevel@tonic-gate continue; 3559*0Sstevel@tonic-gate } 3560*0Sstevel@tonic-gate 3561*0Sstevel@tonic-gate if (clnt_ownset(nd->nd_nodename, sp, 3562*0Sstevel@tonic-gate &is_owner, ep) == -1) { 3563*0Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 3564*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 3565*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 3566*0Sstevel@tonic-gate nd->nd_nodeid)) { 3567*0Sstevel@tonic-gate return (205); 3568*0Sstevel@tonic-gate } else { 3569*0Sstevel@tonic-gate /* Any other failure */ 3570*0Sstevel@tonic-gate return (-1); 3571*0Sstevel@tonic-gate } 3572*0Sstevel@tonic-gate } 3573*0Sstevel@tonic-gate 3574*0Sstevel@tonic-gate /* 3575*0Sstevel@tonic-gate * Set owner flag for each node based on whether 3576*0Sstevel@tonic-gate * that node really has a diskset mddb snarfed in 3577*0Sstevel@tonic-gate * or not. 3578*0Sstevel@tonic-gate */ 3579*0Sstevel@tonic-gate if (is_owner == TRUE) 3580*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN; 3581*0Sstevel@tonic-gate else 3582*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 3583*0Sstevel@tonic-gate 3584*0Sstevel@tonic-gate nd = nd->nd_next; 3585*0Sstevel@tonic-gate } 3586*0Sstevel@tonic-gate 3587*0Sstevel@tonic-gate /* 3588*0Sstevel@tonic-gate * - node walks through nodelist looking for nodes that 3589*0Sstevel@tonic-gate * are owners of the diskset that are in 3590*0Sstevel@tonic-gate * the membership list. 
3591*0Sstevel@tonic-gate * - for each owner, node calls RPC routine clnt_getset to 3592*0Sstevel@tonic-gate * see if that node has a master set and to get the 3593*0Sstevel@tonic-gate * diskset description. 3594*0Sstevel@tonic-gate * - If the owner node has a set description that doesn't 3595*0Sstevel@tonic-gate * include the non-joined node in the nodelist, this node 3596*0Sstevel@tonic-gate * removes its set description of that diskset 3597*0Sstevel@tonic-gate * (i.e. removes the set from its local mddbs). This is 3598*0Sstevel@tonic-gate * handling the case of when a node was removed from a 3599*0Sstevel@tonic-gate * diskset while it was not in the cluster membership 3600*0Sstevel@tonic-gate * list. 3601*0Sstevel@tonic-gate * - If that node has a master set and the master is in the 3602*0Sstevel@tonic-gate * membership list and is an owner, then either this was 3603*0Sstevel@tonic-gate * the master from before the reconfig cycle or this 3604*0Sstevel@tonic-gate * node has already chosen a new master - either way, 3605*0Sstevel@tonic-gate * the master value is valid as long as it is in the 3606*0Sstevel@tonic-gate * membership list and is an owner 3607*0Sstevel@tonic-gate * - master is chosen to be owner node's master 3608*0Sstevel@tonic-gate */ 3609*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3610*0Sstevel@tonic-gate while (nd) { 3611*0Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 3612*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3613*0Sstevel@tonic-gate nd = nd->nd_next; 3614*0Sstevel@tonic-gate continue; 3615*0Sstevel@tonic-gate } 3616*0Sstevel@tonic-gate 3617*0Sstevel@tonic-gate /* Don't consider a node that isn't an owner */ 3618*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3619*0Sstevel@tonic-gate nd = nd->nd_next; 3620*0Sstevel@tonic-gate continue; 3621*0Sstevel@tonic-gate } 3622*0Sstevel@tonic-gate 3623*0Sstevel@tonic-gate /* Get owner node's set record */ 3624*0Sstevel@tonic-gate if 
(clnt_mngetset(nd->nd_nodename, sp->setname, 3625*0Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 3626*0Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 3627*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 3628*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 3629*0Sstevel@tonic-gate nd->nd_nodeid)) { 3630*0Sstevel@tonic-gate return (205); 3631*0Sstevel@tonic-gate } else { 3632*0Sstevel@tonic-gate /* Any other failure */ 3633*0Sstevel@tonic-gate return (-1); 3634*0Sstevel@tonic-gate } 3635*0Sstevel@tonic-gate } 3636*0Sstevel@tonic-gate 3637*0Sstevel@tonic-gate /* Is this node in the owner node's set record */ 3638*0Sstevel@tonic-gate nr = mnsr->sr_nodechain; 3639*0Sstevel@tonic-gate while (nr) { 3640*0Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid == 3641*0Sstevel@tonic-gate nr->nr_nodeid) { 3642*0Sstevel@tonic-gate break; 3643*0Sstevel@tonic-gate } 3644*0Sstevel@tonic-gate nr = nr->nr_next; 3645*0Sstevel@tonic-gate } 3646*0Sstevel@tonic-gate if (nr == NULL) { 3647*0Sstevel@tonic-gate /* my node not found - delete set */ 3648*0Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 3649*0Sstevel@tonic-gate goto delete_set; 3650*0Sstevel@tonic-gate } 3651*0Sstevel@tonic-gate 3652*0Sstevel@tonic-gate /* Is owner's node's master valid? 
*/ 3653*0Sstevel@tonic-gate master_nodeid = mnsr->sr_master_nodeid; 3654*0Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 3655*0Sstevel@tonic-gate if (master_nodeid == MD_MN_INVALID_NID) { 3656*0Sstevel@tonic-gate nd = nd->nd_next; 3657*0Sstevel@tonic-gate continue; 3658*0Sstevel@tonic-gate } 3659*0Sstevel@tonic-gate 3660*0Sstevel@tonic-gate nd2 = sd->sd_nodelist; 3661*0Sstevel@tonic-gate while (nd2) { 3662*0Sstevel@tonic-gate if ((nd2->nd_nodeid == master_nodeid) && 3663*0Sstevel@tonic-gate (nd2->nd_flags & MD_MN_NODE_ALIVE) && 3664*0Sstevel@tonic-gate (nd2->nd_flags & MD_MN_NODE_OWN)) { 3665*0Sstevel@tonic-gate nd = nd2; 3666*0Sstevel@tonic-gate goto found_master; 3667*0Sstevel@tonic-gate } 3668*0Sstevel@tonic-gate nd2 = nd2->nd_next; 3669*0Sstevel@tonic-gate } 3670*0Sstevel@tonic-gate nd = nd->nd_next; 3671*0Sstevel@tonic-gate } 3672*0Sstevel@tonic-gate 3673*0Sstevel@tonic-gate /* 3674*0Sstevel@tonic-gate * - If no owner node has a valid master, then follow 3675*0Sstevel@tonic-gate * algorithm of when a node is joined to the diskset. 3676*0Sstevel@tonic-gate * - node walks through nodelist looking for nodes that are 3677*0Sstevel@tonic-gate * owners of the diskset that are in the membership list. 3678*0Sstevel@tonic-gate * - for each owner, node calls RPC routine clnt_getset to 3679*0Sstevel@tonic-gate * see if that node has its node record set to OK. 3680*0Sstevel@tonic-gate * - If so, master is chosen to be this owner node. 
3681*0Sstevel@tonic-gate */ 3682*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3683*0Sstevel@tonic-gate while (nd) { 3684*0Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 3685*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3686*0Sstevel@tonic-gate nd = nd->nd_next; 3687*0Sstevel@tonic-gate continue; 3688*0Sstevel@tonic-gate } 3689*0Sstevel@tonic-gate 3690*0Sstevel@tonic-gate /* Don't consider a node that isn't an owner */ 3691*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3692*0Sstevel@tonic-gate nd = nd->nd_next; 3693*0Sstevel@tonic-gate continue; 3694*0Sstevel@tonic-gate } 3695*0Sstevel@tonic-gate 3696*0Sstevel@tonic-gate /* Does node has its own node record set to OK? */ 3697*0Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 3698*0Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 3699*0Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 3700*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 3701*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 3702*0Sstevel@tonic-gate nd->nd_nodeid)) { 3703*0Sstevel@tonic-gate return (205); 3704*0Sstevel@tonic-gate } else { 3705*0Sstevel@tonic-gate /* Any other failure */ 3706*0Sstevel@tonic-gate return (-1); 3707*0Sstevel@tonic-gate } 3708*0Sstevel@tonic-gate } 3709*0Sstevel@tonic-gate nr = mnsr->sr_nodechain; 3710*0Sstevel@tonic-gate while (nr) { 3711*0Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) { 3712*0Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) { 3713*0Sstevel@tonic-gate /* Found a master */ 3714*0Sstevel@tonic-gate free_sr( 3715*0Sstevel@tonic-gate (md_set_record *)mnsr); 3716*0Sstevel@tonic-gate goto found_master; 3717*0Sstevel@tonic-gate } 3718*0Sstevel@tonic-gate } 3719*0Sstevel@tonic-gate nr = nr->nr_next; 3720*0Sstevel@tonic-gate } 3721*0Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 3722*0Sstevel@tonic-gate nd = nd->nd_next; 3723*0Sstevel@tonic-gate } 3724*0Sstevel@tonic-gate 3725*0Sstevel@tonic-gate /* 
3726*0Sstevel@tonic-gate * - If no owner node has its own node record on its own node 3727*0Sstevel@tonic-gate * set to OK, then this node checks all of the non-owner 3728*0Sstevel@tonic-gate * nodes that are in the membership list. 3729*0Sstevel@tonic-gate * - for each non-owner, node calls RPC routine clnt_getset to 3730*0Sstevel@tonic-gate * see if that node has its node record set to OK. 3731*0Sstevel@tonic-gate * - If set doesn't exist, don't choose node for master. 3732*0Sstevel@tonic-gate * - If this node doesn't exist in the nodelist on any of the 3733*0Sstevel@tonic-gate * non-owner nodes, this node removes its set description 3734*0Sstevel@tonic-gate * of that diskset (i.e. removes the set from its local 3735*0Sstevel@tonic-gate * mddbs). This is handling the case of when a node was 3736*0Sstevel@tonic-gate * removed from a diskset while it was not in the 3737*0Sstevel@tonic-gate * cluster membership list. 3738*0Sstevel@tonic-gate * - If non-owner node has its node record set to OK and if 3739*0Sstevel@tonic-gate * this node hasn't removed this diskset (step directly 3740*0Sstevel@tonic-gate * before this one), then the master is chosen to be this 3741*0Sstevel@tonic-gate * non-owner node. 
3742*0Sstevel@tonic-gate */ 3743*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3744*0Sstevel@tonic-gate while (nd) { 3745*0Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 3746*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3747*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 3748*0Sstevel@tonic-gate nd = nd->nd_next; 3749*0Sstevel@tonic-gate continue; 3750*0Sstevel@tonic-gate } 3751*0Sstevel@tonic-gate 3752*0Sstevel@tonic-gate /* Don't consider owner nodes since none are OK */ 3753*0Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 3754*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 3755*0Sstevel@tonic-gate nd = nd->nd_next; 3756*0Sstevel@tonic-gate continue; 3757*0Sstevel@tonic-gate } 3758*0Sstevel@tonic-gate 3759*0Sstevel@tonic-gate /* 3760*0Sstevel@tonic-gate * Don't need to get nodelist from my node since 3761*0Sstevel@tonic-gate * this is where sd_nodelist was obtained. 3762*0Sstevel@tonic-gate */ 3763*0Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 3764*0Sstevel@tonic-gate nd = nd->nd_next; 3765*0Sstevel@tonic-gate continue; 3766*0Sstevel@tonic-gate } 3767*0Sstevel@tonic-gate 3768*0Sstevel@tonic-gate /* 3769*0Sstevel@tonic-gate * If node has already been decided against for 3770*0Sstevel@tonic-gate * master, then skip it. 3771*0Sstevel@tonic-gate */ 3772*0Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_DEL) { 3773*0Sstevel@tonic-gate nd = nd->nd_next; 3774*0Sstevel@tonic-gate continue; 3775*0Sstevel@tonic-gate } 3776*0Sstevel@tonic-gate 3777*0Sstevel@tonic-gate /* 3778*0Sstevel@tonic-gate * Does node in my nodelist have its own node 3779*0Sstevel@tonic-gate * record marked OK on its node? And does node 3780*0Sstevel@tonic-gate * in my nodelist exist on all other nodes? 3781*0Sstevel@tonic-gate * Don't want to choose a node for master unless 3782*0Sstevel@tonic-gate * that node is marked OK on its own node and that 3783*0Sstevel@tonic-gate * node exists on all other alive nodes. 
3784*0Sstevel@tonic-gate * 3785*0Sstevel@tonic-gate * This is guarding against the case when several 3786*0Sstevel@tonic-gate * nodes are down and one of the downed nodes is 3787*0Sstevel@tonic-gate * deleted from the diskset. When the down nodes 3788*0Sstevel@tonic-gate * are rebooted into the cluster, you don't want 3789*0Sstevel@tonic-gate * any node to pick the deleted node as the master. 3790*0Sstevel@tonic-gate */ 3791*0Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 3792*0Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 3793*0Sstevel@tonic-gate /* 3794*0Sstevel@tonic-gate * If set doesn't exist on non-owner node, 3795*0Sstevel@tonic-gate * don't consider this node for master. 3796*0Sstevel@tonic-gate */ 3797*0Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 3798*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 3799*0Sstevel@tonic-gate nd = nd->nd_next; 3800*0Sstevel@tonic-gate continue; 3801*0Sstevel@tonic-gate } else if (mdanyrpcerror(ep)) { 3802*0Sstevel@tonic-gate /* RPC failure to another node */ 3803*0Sstevel@tonic-gate return (205); 3804*0Sstevel@tonic-gate } else { 3805*0Sstevel@tonic-gate /* Any other failure */ 3806*0Sstevel@tonic-gate return (-1); 3807*0Sstevel@tonic-gate } 3808*0Sstevel@tonic-gate } 3809*0Sstevel@tonic-gate /* 3810*0Sstevel@tonic-gate * Is my node in the nodelist gotten from the other 3811*0Sstevel@tonic-gate * node? If not, then remove the set from my node 3812*0Sstevel@tonic-gate * since set was deleted from my node while my node 3813*0Sstevel@tonic-gate * was out of the cluster. 
3814*0Sstevel@tonic-gate */ 3815*0Sstevel@tonic-gate nr = mnsr->sr_nodechain; 3816*0Sstevel@tonic-gate while (nr) { 3817*0Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid == 3818*0Sstevel@tonic-gate nr->nr_nodeid) { 3819*0Sstevel@tonic-gate break; 3820*0Sstevel@tonic-gate } 3821*0Sstevel@tonic-gate nr = nr->nr_next; 3822*0Sstevel@tonic-gate } 3823*0Sstevel@tonic-gate if (nr == NULL) { 3824*0Sstevel@tonic-gate /* my node not found - delete set */ 3825*0Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 3826*0Sstevel@tonic-gate goto delete_set; 3827*0Sstevel@tonic-gate } 3828*0Sstevel@tonic-gate 3829*0Sstevel@tonic-gate /* Is node being checked marked OK on its own node? */ 3830*0Sstevel@tonic-gate nr = mnsr->sr_nodechain; 3831*0Sstevel@tonic-gate while (nr) { 3832*0Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) { 3833*0Sstevel@tonic-gate if (!(nr->nr_flags & MD_MN_NODE_OK)) { 3834*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 3835*0Sstevel@tonic-gate } 3836*0Sstevel@tonic-gate break; 3837*0Sstevel@tonic-gate } 3838*0Sstevel@tonic-gate nr = nr->nr_next; 3839*0Sstevel@tonic-gate } 3840*0Sstevel@tonic-gate /* 3841*0Sstevel@tonic-gate * If node being checked doesn't exist on its 3842*0Sstevel@tonic-gate * own node - don't choose it as master. 3843*0Sstevel@tonic-gate */ 3844*0Sstevel@tonic-gate if (nr == NULL) { 3845*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 3846*0Sstevel@tonic-gate } 3847*0Sstevel@tonic-gate 3848*0Sstevel@tonic-gate /* 3849*0Sstevel@tonic-gate * Check every node in my node's nodelist against 3850*0Sstevel@tonic-gate * the nodelist gotten from the other node. 3851*0Sstevel@tonic-gate * If a node in my node's nodelist is not found in the 3852*0Sstevel@tonic-gate * other node's nodelist, then set the DEL flag. 
3853*0Sstevel@tonic-gate */ 3854*0Sstevel@tonic-gate nd2 = sd->sd_nodelist; 3855*0Sstevel@tonic-gate while (nd2) { 3856*0Sstevel@tonic-gate nr = mnsr->sr_nodechain; 3857*0Sstevel@tonic-gate while (nr) { 3858*0Sstevel@tonic-gate if (nd2->nd_nodeid == nr->nr_nodeid) { 3859*0Sstevel@tonic-gate break; 3860*0Sstevel@tonic-gate } 3861*0Sstevel@tonic-gate nr = nr->nr_next; 3862*0Sstevel@tonic-gate } 3863*0Sstevel@tonic-gate /* nd2 not found in other node's nodelist */ 3864*0Sstevel@tonic-gate if (nr == NULL) { 3865*0Sstevel@tonic-gate nd2->nd_flags |= MD_MN_NODE_DEL; 3866*0Sstevel@tonic-gate } 3867*0Sstevel@tonic-gate nd2 = nd2->nd_next; 3868*0Sstevel@tonic-gate } 3869*0Sstevel@tonic-gate 3870*0Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 3871*0Sstevel@tonic-gate nd = nd->nd_next; 3872*0Sstevel@tonic-gate } 3873*0Sstevel@tonic-gate 3874*0Sstevel@tonic-gate /* 3875*0Sstevel@tonic-gate * Rescan list look for node that has not been marked DEL. 3876*0Sstevel@tonic-gate * First node found is the master. 3877*0Sstevel@tonic-gate */ 3878*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3879*0Sstevel@tonic-gate while (nd) { 3880*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 3881*0Sstevel@tonic-gate break; 3882*0Sstevel@tonic-gate } 3883*0Sstevel@tonic-gate nd = nd->nd_next; 3884*0Sstevel@tonic-gate continue; 3885*0Sstevel@tonic-gate } 3886*0Sstevel@tonic-gate if (nd) { 3887*0Sstevel@tonic-gate /* Found a master */ 3888*0Sstevel@tonic-gate goto found_master; 3889*0Sstevel@tonic-gate } 3890*0Sstevel@tonic-gate 3891*0Sstevel@tonic-gate /* 3892*0Sstevel@tonic-gate * - If no node can be found that has its own node record on 3893*0Sstevel@tonic-gate * its node to be set to OK, then all alive nodes 3894*0Sstevel@tonic-gate * were in the process of being added to or deleted 3895*0Sstevel@tonic-gate * from set. Each alive node will remove all 3896*0Sstevel@tonic-gate * information pertaining to this set from its node. 
3897*0Sstevel@tonic-gate * 3898*0Sstevel@tonic-gate * If all nodes in set are ALIVE, then call sdssc end routines 3899*0Sstevel@tonic-gate * since set was truly being initially created or destroyed. 3900*0Sstevel@tonic-gate */ 3901*0Sstevel@tonic-gate goto delete_set; 3902*0Sstevel@tonic-gate } 3903*0Sstevel@tonic-gate 3904*0Sstevel@tonic-gate found_master: 3905*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 3906*0Sstevel@tonic-gate "Set %s master chosen %s (%d): %s"), 3907*0Sstevel@tonic-gate sp->setname, nd->nd_nodename, nd->nd_nodeid, 3908*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 3909*0Sstevel@tonic-gate 3910*0Sstevel@tonic-gate if (clnt_lock_set(mynode(), sp, ep) == -1) { 3911*0Sstevel@tonic-gate return (-1); 3912*0Sstevel@tonic-gate } 3913*0Sstevel@tonic-gate 3914*0Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 3915*0Sstevel@tonic-gate 3916*0Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp, 3917*0Sstevel@tonic-gate nd->nd_nodename, nd->nd_nodeid, ep)) { 3918*0Sstevel@tonic-gate rval = -1; 3919*0Sstevel@tonic-gate } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 3920*0Sstevel@tonic-gate /* If this node is new master, set flag in this node's kernel */ 3921*0Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 3922*0Sstevel@tonic-gate sf.sf_setno = sp->setno; 3923*0Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 3924*0Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. 
*/ 3925*0Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 3926*0Sstevel@tonic-gate sf.sf_flags = MDDB_NM_SET; 3927*0Sstevel@tonic-gate 3928*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 3929*0Sstevel@tonic-gate "Setting new master flag for set %s: %s"), 3930*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 3931*0Sstevel@tonic-gate 3932*0Sstevel@tonic-gate /* 3933*0Sstevel@tonic-gate * Fail reconfig cycle if ioctl fails since it is critical 3934*0Sstevel@tonic-gate * to set new master flag. 3935*0Sstevel@tonic-gate */ 3936*0Sstevel@tonic-gate if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, 3937*0Sstevel@tonic-gate NULL) != NULL) { 3938*0Sstevel@tonic-gate (void) mdstealerror(ep, &sf.sf_mde); 3939*0Sstevel@tonic-gate rval = -1; 3940*0Sstevel@tonic-gate } 3941*0Sstevel@tonic-gate } 3942*0Sstevel@tonic-gate 3943*0Sstevel@tonic-gate if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 3944*0Sstevel@tonic-gate if (rval == 0) { 3945*0Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 3946*0Sstevel@tonic-gate rval = -1; 3947*0Sstevel@tonic-gate } 3948*0Sstevel@tonic-gate } 3949*0Sstevel@tonic-gate 3950*0Sstevel@tonic-gate cl_set_setkey(NULL); 3951*0Sstevel@tonic-gate 3952*0Sstevel@tonic-gate metaflushsetname(sp); 3953*0Sstevel@tonic-gate 3954*0Sstevel@tonic-gate return (rval); 3955*0Sstevel@tonic-gate 3956*0Sstevel@tonic-gate delete_set: 3957*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 3958*0Sstevel@tonic-gate "Master not chosen, deleting set %s: %s"), 3959*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 3960*0Sstevel@tonic-gate 3961*0Sstevel@tonic-gate /* 3962*0Sstevel@tonic-gate * Remove all set information from this node: 3963*0Sstevel@tonic-gate * - node records for this set 3964*0Sstevel@tonic-gate * - drive records for this set 3965*0Sstevel@tonic-gate * - set record for this set 3966*0Sstevel@tonic-gate * (Only do this on this node since each node 
3967*0Sstevel@tonic-gate * will do it for its own local mddb.) 3968*0Sstevel@tonic-gate * 3969*0Sstevel@tonic-gate * If all nodes in set are ALIVE, then 3970*0Sstevel@tonic-gate * the lowest numbered ALIVE nodeid in set 3971*0Sstevel@tonic-gate * (irregardless of whether an owner node or not) will 3972*0Sstevel@tonic-gate * call the DCS service to cleanup for create/delete of set. 3973*0Sstevel@tonic-gate * sdssc_create_end(cleanup) if set was being created or 3974*0Sstevel@tonic-gate * sdssc_delete_end(cleanup) if set was being deleted. 3975*0Sstevel@tonic-gate * A node record with flag ADD denotes a set being 3976*0Sstevel@tonic-gate * created. A node record with flag DEL denotes a 3977*0Sstevel@tonic-gate * set being deleted. 3978*0Sstevel@tonic-gate */ 3979*0Sstevel@tonic-gate nd = sd->sd_nodelist; 3980*0Sstevel@tonic-gate while (nd) { 3981*0Sstevel@tonic-gate /* Found a node that isn't alive */ 3982*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) 3983*0Sstevel@tonic-gate break; 3984*0Sstevel@tonic-gate 3985*0Sstevel@tonic-gate /* Is my node the lowest numbered ALIVE node? */ 3986*0Sstevel@tonic-gate if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) { 3987*0Sstevel@tonic-gate break; 3988*0Sstevel@tonic-gate } 3989*0Sstevel@tonic-gate nd = nd->nd_next; 3990*0Sstevel@tonic-gate } 3991*0Sstevel@tonic-gate if (nd == NULL) { 3992*0Sstevel@tonic-gate /* All nodes ALIVE and this is the lowest nodeid */ 3993*0Sstevel@tonic-gate lowest_alive_nodeid = 1; 3994*0Sstevel@tonic-gate } 3995*0Sstevel@tonic-gate 3996*0Sstevel@tonic-gate if (clnt_lock_set(mynode(), sp, ep) == -1) { 3997*0Sstevel@tonic-gate return (-1); 3998*0Sstevel@tonic-gate } 3999*0Sstevel@tonic-gate 4000*0Sstevel@tonic-gate 4001*0Sstevel@tonic-gate /* 4002*0Sstevel@tonic-gate * If this node had been joined, withdraw and reset master. 
4003*0Sstevel@tonic-gate * 4004*0Sstevel@tonic-gate * This could happen if a node was being added to or removed 4005*0Sstevel@tonic-gate * from a diskset and the node doing the add/delete operation and 4006*0Sstevel@tonic-gate * all other nodes in the diskset have left the cluster. 4007*0Sstevel@tonic-gate */ 4008*0Sstevel@tonic-gate if (sd->sd_mn_mynode) { 4009*0Sstevel@tonic-gate nd = sd->sd_mn_mynode; 4010*0Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 4011*0Sstevel@tonic-gate if (clnt_withdrawset(mynode(), sp, ep)) { 4012*0Sstevel@tonic-gate rval = -1; 4013*0Sstevel@tonic-gate goto out; 4014*0Sstevel@tonic-gate } 4015*0Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp, "", 4016*0Sstevel@tonic-gate MD_MN_INVALID_NID, ep)) { 4017*0Sstevel@tonic-gate rval = -1; 4018*0Sstevel@tonic-gate goto out; 4019*0Sstevel@tonic-gate } 4020*0Sstevel@tonic-gate } 4021*0Sstevel@tonic-gate } 4022*0Sstevel@tonic-gate 4023*0Sstevel@tonic-gate /* 4024*0Sstevel@tonic-gate * Remove side records for this node (side) from local mddb 4025*0Sstevel@tonic-gate * (clnt_deldrvs does this) if there are drives in the set. 4026*0Sstevel@tonic-gate * 4027*0Sstevel@tonic-gate * Don't need to mark this node as DEL since already marked as 4028*0Sstevel@tonic-gate * ADD or DEL (or this node would have been chosen as master). 4029*0Sstevel@tonic-gate * Don't need to mark other node records, drive records or 4030*0Sstevel@tonic-gate * set records as DEL. If a panic occurs during clnt_delset, 4031*0Sstevel@tonic-gate * these records will be deleted the next time this node 4032*0Sstevel@tonic-gate * becomes a member and goes through the reconfig cycle. 4033*0Sstevel@tonic-gate */ 4034*0Sstevel@tonic-gate /* Get the drive descriptors for this set */ 4035*0Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4036*0Sstevel@tonic-gate ep)) == NULL) { 4037*0Sstevel@tonic-gate if (! 
mdisok(ep)) { 4038*0Sstevel@tonic-gate /* 4039*0Sstevel@tonic-gate * Ignore and clear out any failures from 4040*0Sstevel@tonic-gate * metaget_drivedesc since a panic could have 4041*0Sstevel@tonic-gate * occurred when a node was partially added to a set. 4042*0Sstevel@tonic-gate */ 4043*0Sstevel@tonic-gate mdclrerror(ep); 4044*0Sstevel@tonic-gate } 4045*0Sstevel@tonic-gate } else { 4046*0Sstevel@tonic-gate if (clnt_deldrvs(mynode(), sp, dd, ep)) { 4047*0Sstevel@tonic-gate rval = -1; 4048*0Sstevel@tonic-gate goto out; 4049*0Sstevel@tonic-gate } 4050*0Sstevel@tonic-gate } 4051*0Sstevel@tonic-gate 4052*0Sstevel@tonic-gate /* 4053*0Sstevel@tonic-gate * Now, delete the set - this removes the node, drive 4054*0Sstevel@tonic-gate * and set records from the local mddb. 4055*0Sstevel@tonic-gate */ 4056*0Sstevel@tonic-gate if (clnt_delset(mynode(), sp, ep)) { 4057*0Sstevel@tonic-gate rval = -1; 4058*0Sstevel@tonic-gate goto out; 4059*0Sstevel@tonic-gate } 4060*0Sstevel@tonic-gate 4061*0Sstevel@tonic-gate out: 4062*0Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 4063*0Sstevel@tonic-gate 4064*0Sstevel@tonic-gate /* 4065*0Sstevel@tonic-gate * Ignore errors from unlock of set since set is no longer 4066*0Sstevel@tonic-gate * known (if clnt_delset worked). 4067*0Sstevel@tonic-gate */ 4068*0Sstevel@tonic-gate if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 4069*0Sstevel@tonic-gate mdclrerror(&xep); 4070*0Sstevel@tonic-gate } 4071*0Sstevel@tonic-gate 4072*0Sstevel@tonic-gate cl_set_setkey(NULL); 4073*0Sstevel@tonic-gate 4074*0Sstevel@tonic-gate metaflushsetname(sp); 4075*0Sstevel@tonic-gate 4076*0Sstevel@tonic-gate /* 4077*0Sstevel@tonic-gate * If this node is the lowest numbered nodeid then 4078*0Sstevel@tonic-gate * call sdssc_create/delete_end depending on whether 4079*0Sstevel@tonic-gate * this node is marked as ADD or DEL in the node record. 
4080*0Sstevel@tonic-gate */ 4081*0Sstevel@tonic-gate if (lowest_alive_nodeid) { 4082*0Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_ADD) 4083*0Sstevel@tonic-gate sdssc_create_end(sp->setname, SDSSC_CLEANUP); 4084*0Sstevel@tonic-gate else if (nd->nd_flags & MD_MN_NODE_DEL) 4085*0Sstevel@tonic-gate sdssc_delete_end(sp->setname, SDSSC_CLEANUP); 4086*0Sstevel@tonic-gate } 4087*0Sstevel@tonic-gate 4088*0Sstevel@tonic-gate /* Finished with this set -- return */ 4089*0Sstevel@tonic-gate return (rval); 4090*0Sstevel@tonic-gate } 4091*0Sstevel@tonic-gate
/*
 * Reconfig step to choose a new master for all MN disksets.
 *
 * Every set number from 1 up to (but excluding) get_max_sets() is
 * examined.  Set numbers with no set, sets whose information cannot be
 * obtained (for a reason other than an RPC error) and non-MN disksets
 * are skipped.  For each remaining set the nodelist is updated with the
 * membership information, a master is chosen by
 * meta_reconfig_choose_master_for_set() and mdmn_reinit_set() is called.
 * Once all sets have been handled, I/Os are resumed for all MN disksets.
 *
 * The membership list read by meta_read_nodelist() is freed on every
 * exit path that follows a successful read, including the failure paths.
 *
 * Return values:
 *	0 - Everything is great.
 *	1 - This node failed to reconfig.
 *	205 - Cause another reconfig due to a nodelist problem
 *		or RPC failure to another node
 */
int
meta_reconfig_choose_master(
	md_error_t	*ep
)
{
	set_t				max_sets, setno;
	int				nodecnt;
	mndiskset_membershiplist_t	*nl;
	md_set_desc			*sd;
	mdsetname_t			*sp;
	int				rval = 0;	/* per-set result */
	int				ret = 0;	/* value returned */
	mddb_setflags_config_t		sf;
	int				start_node_delayed = 0;

	if ((max_sets = get_max_sets(ep)) == 0) {
		mde_perror(ep, dgettext(TEXT_DOMAIN,
		    "Unable to get number of sets"));
		return (1);
	}

	/*
	 * Get membershiplist from API routine.  If there's
	 * an error, return a 205 to cause another reconfig.
	 * Nothing has been allocated yet, so return directly.
	 */
	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
		mde_perror(ep, "");
		return (205);
	}

	/*
	 * From here on every failure leaves through "out" so that the
	 * membership list is released.
	 */
	for (setno = 1; setno < max_sets; setno++) {
		if ((sp = metasetnosetname(setno, ep)) == NULL) {
			if (mdiserror(ep, MDE_NO_SET)) {
				/* No set for this setno - continue */
				mdclrerror(ep);
				continue;
			}
			/*
			 * If encountered an RPC error from my node,
			 * then immediately fail.
			 */
			if (mdanyrpcerror(ep)) {
				mde_perror(ep, "");
				ret = 1;
				goto out;
			}
			/* Can't get set information */
			mde_perror(ep, dgettext(TEXT_DOMAIN,
			    "Unable to get information for "
			    "set number %d"), setno);
			mdclrerror(ep);
			continue;
		}

		/* If setname is there, set desc should exist. */
		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
			/*
			 * If encountered an RPC error from my node,
			 * then immediately fail.
			 */
			if (mdanyrpcerror(ep)) {
				mde_perror(ep, "");
				ret = 1;
				goto out;
			}
			mde_perror(ep, dgettext(TEXT_DOMAIN,
			    "Unable to get set %s desc information"),
			    sp->setname);
			mdclrerror(ep);
			continue;
		}

		/* Only reconfig MN disksets */
		if (!MD_MNSET_DESC(sd)) {
			continue;
		}

		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
		    "Begin choose master for set %s: %s"),
		    sp->setname, meta_print_hrtime(gethrtime() - start_time));

		/* Update nodelist with member information. */
		if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
			/*
			 * If encountered an RPC error from my node,
			 * then immediately fail.
			 */
			if (mdanyrpcerror(ep)) {
				mde_perror(ep, "");
				ret = 1;
				goto out;
			}
			mde_perror(ep, "");
			mdclrerror(ep);
			continue;
		}

		/*
		 * If all nodes in a cluster are starting, then
		 * all nodes will attempt to contact all other nodes
		 * to determine a master node.  This can lead to a
		 * problem where node 1 is trying to contact the rpc.metad
		 * on node 2 and node 2 is trying to contact the rpc.metad
		 * on node 1 -- and this causes the rpc call to fail
		 * on both nodes and causes a new reconfig cycle.
		 *
		 * In order to break this problem, a newly starting node
		 * will delay a small amount of time (nodeid mod 4 seconds)
		 * and will then run the code to choose a master for the
		 * first set.  Delay will only be done once regardless of the
		 * number of sets.
		 */
		if (start_node_delayed == 0) {
			(void) memset(&sf, 0, sizeof (sf));
			sf.sf_setno = sp->setno;
			sf.sf_flags = MDDB_NM_GET;
			/* Use magic to help protect ioctl against attack. */
			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
			/*
			 * sd_mn_mynode is NULL when this node has no node
			 * record in the set; there is then no nodeid to
			 * derive a delay from, so the delay is skipped
			 * instead of dereferencing a NULL pointer.
			 */
			if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
			    &sf.sf_mde, NULL) == 0) &&
			    ((sf.sf_setflags & MD_SET_MN_START_RC) ==
			    MD_SET_MN_START_RC) &&
			    (sd->sd_mn_mynode != NULL)) {
				(void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
			}
			start_node_delayed = 1;
		}

		/* Choose master for this set */
		rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
		if (rval == -1) {
			mde_perror(ep, "");
			ret = 1;
			goto out;
		} else if (rval == 205) {
			mde_perror(ep, "");
			ret = 205;
			goto out;
		}

		/* Send new nodelist to rpc.mdcommd */
		(void) mdmn_reinit_set(sp->setno);

		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
		    "Choose master for set %s completed: %s"),
		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
	}

	/*
	 * Each node turns on I/Os for all MN disksets.
	 * This is to recover from the situation where the master died
	 * during a previous reconfig cycle when I/Os were suspended
	 * for a MN diskset.
	 * If a failure occurs return a 1 which will force this node to
	 * panic.  Cannot leave node in the situation where I/Os are
	 * not resumed.
	 */
	setno = 0; /* 0 means all MN sets */
	if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
		mde_perror(ep, "");
		ret = 1;
	}

out:
	/* Free the nodelist */
	if (nodecnt)
		meta_free_nodelist(nl);

	return (ret);
}
4263*0Sstevel@tonic-gate 4264*0Sstevel@tonic-gate /* 4265*0Sstevel@tonic-gate * meta_mnsync_user_records will synchronize the diskset user records across 4266*0Sstevel@tonic-gate * all nodes in the diskset. The diskset user records are stored in 4267*0Sstevel@tonic-gate * each node's local set mddb. 4268*0Sstevel@tonic-gate * 4269*0Sstevel@tonic-gate * This needs to be done even if there is no master change during the 4270*0Sstevel@tonic-gate * reconfig cycle since this routine should clean up any mess left by 4271*0Sstevel@tonic-gate * the untimely termination of a metaset or metadb command (due to a 4272*0Sstevel@tonic-gate * node panic or to user intervention). 4273*0Sstevel@tonic-gate * 4274*0Sstevel@tonic-gate * Caller is the Master node. 4275*0Sstevel@tonic-gate * 4276*0Sstevel@tonic-gate * Returns 0 - Success 4277*0Sstevel@tonic-gate * 205 - Failure during RPC to another node 4278*0Sstevel@tonic-gate * -1 - Any other failure and ep is filled in.
4279*0Sstevel@tonic-gate */ 4280*0Sstevel@tonic-gate int 4281*0Sstevel@tonic-gate meta_mnsync_user_records( 4282*0Sstevel@tonic-gate mdsetname_t *sp, 4283*0Sstevel@tonic-gate md_error_t *ep 4284*0Sstevel@tonic-gate ) 4285*0Sstevel@tonic-gate { 4286*0Sstevel@tonic-gate md_set_desc *sd; 4287*0Sstevel@tonic-gate md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail; 4288*0Sstevel@tonic-gate md_mnset_record *mnsr; 4289*0Sstevel@tonic-gate md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL; 4290*0Sstevel@tonic-gate md_mnnode_record *nr; 4291*0Sstevel@tonic-gate md_drive_record *dr; 4292*0Sstevel@tonic-gate int dr_cnt, dd_cnt; 4293*0Sstevel@tonic-gate int found_my_nr; 4294*0Sstevel@tonic-gate md_drive_desc *dd, *dd_prev, *master_dd, *other_dd; 4295*0Sstevel@tonic-gate int all_drives_ok; 4296*0Sstevel@tonic-gate int rval = 0; 4297*0Sstevel@tonic-gate int max_genid = 0; 4298*0Sstevel@tonic-gate int num_alive_nodes, num_alive_nodes_del = 0; 4299*0Sstevel@tonic-gate int set_locked = 0; 4300*0Sstevel@tonic-gate md_setkey_t *cl_sk; 4301*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 4302*0Sstevel@tonic-gate char *anode[1]; 4303*0Sstevel@tonic-gate mddb_setflags_config_t sf; 4304*0Sstevel@tonic-gate 4305*0Sstevel@tonic-gate /* 4306*0Sstevel@tonic-gate * Sync up node records first. 4307*0Sstevel@tonic-gate * Construct a master nodelist using the nodelist from this 4308*0Sstevel@tonic-gate * node's rpc.metad node records and then setting the state of each 4309*0Sstevel@tonic-gate * node following these rules: 4310*0Sstevel@tonic-gate * - If a node record is marked OK on its node, mark it OK 4311*0Sstevel@tonic-gate * in the master nodelist (and later OK on all nodes) 4312*0Sstevel@tonic-gate * If a node record is also marked OWN on its node, 4313*0Sstevel@tonic-gate * mark it OWN in the master nodelist. 
4314*0Sstevel@tonic-gate * - If a node record is not marked OK on its node, then mark 4315*0Sstevel@tonic-gate * it as DEL in the master list (later deleting it) 4316*0Sstevel@tonic-gate * - If node record doesn't exist on that node, then mark it DEL 4317*0Sstevel@tonic-gate * (later deleting it) 4318*0Sstevel@tonic-gate * - If set record doesn't exist on that node, mark node as DEL 4319*0Sstevel@tonic-gate * - If a node record doesn't exist on all nodes, then mark it DEL 4320*0Sstevel@tonic-gate * - If a node is not ALIVE, then 4321*0Sstevel@tonic-gate * - If that node marked DEL on any node - mark it DEL 4322*0Sstevel@tonic-gate * in master list but leave in nodelist 4323*0Sstevel@tonic-gate * - If that node is marked as ADD on any node, mark it 4324*0Sstevel@tonic-gate * ADD in the master list but leave in nodelist 4325*0Sstevel@tonic-gate * - When that node returns to the living, the DEL 4326*0Sstevel@tonic-gate * node record will be removed and the ADD node 4327*0Sstevel@tonic-gate * record may be removed if marked ADD on that 4328*0Sstevel@tonic-gate * node. 4329*0Sstevel@tonic-gate * The key rule is to not remove a node from the nodelist until 4330*0Sstevel@tonic-gate * that node record is removed from its own node. Do not want to 4331*0Sstevel@tonic-gate * remove a node's record from all other nodes and then have 4332*0Sstevel@tonic-gate * that node have its own record marked OK so that a node will pick 4333*0Sstevel@tonic-gate * a different master than the other nodes. 4334*0Sstevel@tonic-gate * 4335*0Sstevel@tonic-gate * Next, 4336*0Sstevel@tonic-gate * If node is ALIVE and node record is marked DEL in master nodelist, 4337*0Sstevel@tonic-gate * remove node from set. 4338*0Sstevel@tonic-gate * If node is ALIVE and node record is marked OK in master nodelist, 4339*0Sstevel@tonic-gate * mark it OK on all other nodes. 
4340*0Sstevel@tonic-gate * If node is not ALIVE and node record is marked DEL in master 4341*0Sstevel@tonic-gate * nodelist, mark it DEL on all other nodes. 4342*0Sstevel@tonic-gate * If node is not ALIVE and node record is marked ADD in master, 4343*0Sstevel@tonic-gate * nodelist, mark it ADD on all other nodes. 4344*0Sstevel@tonic-gate */ 4345*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4346*0Sstevel@tonic-gate return (-1); 4347*0Sstevel@tonic-gate } 4348*0Sstevel@tonic-gate master_nodelist = sd->sd_nodelist; 4349*0Sstevel@tonic-gate 4350*0Sstevel@tonic-gate /* 4351*0Sstevel@tonic-gate * Walk through nodelist creating a master nodelist. 4352*0Sstevel@tonic-gate */ 4353*0Sstevel@tonic-gate num_alive_nodes = 0; 4354*0Sstevel@tonic-gate nd = master_nodelist; 4355*0Sstevel@tonic-gate while (nd) { 4356*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4357*0Sstevel@tonic-gate nd = nd->nd_next; 4358*0Sstevel@tonic-gate continue; 4359*0Sstevel@tonic-gate } 4360*0Sstevel@tonic-gate num_alive_nodes++; 4361*0Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 4362*0Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 4363*0Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 4364*0Sstevel@tonic-gate /* set doesn't exist, mark node as DEL */ 4365*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK; 4366*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ADD; 4367*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 4368*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_NOSET; 4369*0Sstevel@tonic-gate nd = nd->nd_next; 4370*0Sstevel@tonic-gate continue; 4371*0Sstevel@tonic-gate } else { 4372*0Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 4373*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 4374*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 4375*0Sstevel@tonic-gate nd->nd_nodeid)) { 4376*0Sstevel@tonic-gate rval = 205; 4377*0Sstevel@tonic-gate } else { 4378*0Sstevel@tonic-gate /* Any other failure */ 
4379*0Sstevel@tonic-gate rval = -1; 4380*0Sstevel@tonic-gate } 4381*0Sstevel@tonic-gate goto out; 4382*0Sstevel@tonic-gate } 4383*0Sstevel@tonic-gate } 4384*0Sstevel@tonic-gate /* Find biggest genid in records for this diskset */ 4385*0Sstevel@tonic-gate if (mnsr->sr_genid > max_genid) 4386*0Sstevel@tonic-gate max_genid = mnsr->sr_genid; 4387*0Sstevel@tonic-gate 4388*0Sstevel@tonic-gate dr = mnsr->sr_drivechain; 4389*0Sstevel@tonic-gate while (dr) { 4390*0Sstevel@tonic-gate /* Find biggest genid in records for this diskset */ 4391*0Sstevel@tonic-gate if (dr->dr_genid > max_genid) { 4392*0Sstevel@tonic-gate max_genid = dr->dr_genid; 4393*0Sstevel@tonic-gate } 4394*0Sstevel@tonic-gate dr = dr->dr_next; 4395*0Sstevel@tonic-gate } 4396*0Sstevel@tonic-gate 4397*0Sstevel@tonic-gate found_my_nr = 0; 4398*0Sstevel@tonic-gate nr = mnsr->sr_nodechain; 4399*0Sstevel@tonic-gate /* nr is the list of node recs from nd_nodename node */ 4400*0Sstevel@tonic-gate while (nr) { 4401*0Sstevel@tonic-gate /* Find biggest genid in records for this diskset */ 4402*0Sstevel@tonic-gate if (nr->nr_genid > max_genid) 4403*0Sstevel@tonic-gate max_genid = nr->nr_genid; 4404*0Sstevel@tonic-gate nd2 = master_nodelist; 4405*0Sstevel@tonic-gate ndtail = NULL; 4406*0Sstevel@tonic-gate /* For each node record, is it in master list? */ 4407*0Sstevel@tonic-gate while (nd2) { 4408*0Sstevel@tonic-gate if (nd2->nd_nodeid == nr->nr_nodeid) 4409*0Sstevel@tonic-gate break; 4410*0Sstevel@tonic-gate if (nd2->nd_next == NULL) 4411*0Sstevel@tonic-gate ndtail = nd2; 4412*0Sstevel@tonic-gate nd2 = nd2->nd_next; 4413*0Sstevel@tonic-gate } 4414*0Sstevel@tonic-gate /* 4415*0Sstevel@tonic-gate * Found node record not in master list -- add it 4416*0Sstevel@tonic-gate * to list marking it as DEL since node record 4417*0Sstevel@tonic-gate * should exist on all nodes unless a panic occurred 4418*0Sstevel@tonic-gate * during addition or deletion of host to diskset. 
4419*0Sstevel@tonic-gate */ 4420*0Sstevel@tonic-gate if (nd2 == NULL) { 4421*0Sstevel@tonic-gate nd2 = Zalloc(sizeof (*nd2)); 4422*0Sstevel@tonic-gate (void) strcpy(nd2->nd_nodename, 4423*0Sstevel@tonic-gate nr->nr_nodename); 4424*0Sstevel@tonic-gate nd2->nd_flags = nr->nr_flags; 4425*0Sstevel@tonic-gate nd2->nd_flags |= MD_MN_NODE_DEL; 4426*0Sstevel@tonic-gate nd2->nd_nodeid = nr->nr_nodeid; 4427*0Sstevel@tonic-gate nd2->nd_next = NULL; 4428*0Sstevel@tonic-gate ndtail->nd_next = nd2; 4429*0Sstevel@tonic-gate nd2 = NULL; 4430*0Sstevel@tonic-gate nr = nr->nr_next; 4431*0Sstevel@tonic-gate continue; 4432*0Sstevel@tonic-gate } 4433*0Sstevel@tonic-gate /* 4434*0Sstevel@tonic-gate * Is this the node record for the node that 4435*0Sstevel@tonic-gate * we requested the set desc from? 4436*0Sstevel@tonic-gate * If so, check if node has its own node record 4437*0Sstevel@tonic-gate * marked OK. If marked OK, check for the OWN bit. 4438*0Sstevel@tonic-gate */ 4439*0Sstevel@tonic-gate if (nr->nr_nodeid == nd->nd_nodeid) { 4440*0Sstevel@tonic-gate found_my_nr = 1; 4441*0Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) { 4442*0Sstevel@tonic-gate /* 4443*0Sstevel@tonic-gate * If node record is marked OK 4444*0Sstevel@tonic-gate * on its own node, then mark it OK 4445*0Sstevel@tonic-gate * in the master list. Node record 4446*0Sstevel@tonic-gate * would have to exist on all nodes 4447*0Sstevel@tonic-gate * in the ADD state before it could 4448*0Sstevel@tonic-gate * be put into the OK state. 4449*0Sstevel@tonic-gate */ 4450*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OK; 4451*0Sstevel@tonic-gate nd->nd_flags &= 4452*0Sstevel@tonic-gate ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL); 4453*0Sstevel@tonic-gate /* 4454*0Sstevel@tonic-gate * Mark own in master list as marked 4455*0Sstevel@tonic-gate * on own node. 
4456*0Sstevel@tonic-gate */ 4457*0Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OWN) 4458*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN; 4459*0Sstevel@tonic-gate else 4460*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 4461*0Sstevel@tonic-gate } else { 4462*0Sstevel@tonic-gate /* Otherwise, mark node as DEL */ 4463*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK; 4464*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ADD; 4465*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 4466*0Sstevel@tonic-gate } 4467*0Sstevel@tonic-gate } 4468*0Sstevel@tonic-gate /* 4469*0Sstevel@tonic-gate * If node is not ALIVE and marked DEL 4470*0Sstevel@tonic-gate * on any node, make it DEL in master list. 4471*0Sstevel@tonic-gate * If node is not ALIVE and marked ADD 4472*0Sstevel@tonic-gate * on any node, make it ADD in master list 4473*0Sstevel@tonic-gate * unless node record has already been marked DEL. 4474*0Sstevel@tonic-gate */ 4475*0Sstevel@tonic-gate if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) { 4476*0Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_ADD) { 4477*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 4478*0Sstevel@tonic-gate /* If not DEL - mark it ADD */ 4479*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_ADD; 4480*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK; 4481*0Sstevel@tonic-gate } 4482*0Sstevel@tonic-gate } 4483*0Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_DEL) { 4484*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 4485*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK; 4486*0Sstevel@tonic-gate /* Could already be ADD - make it DEL */ 4487*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ADD; 4488*0Sstevel@tonic-gate } 4489*0Sstevel@tonic-gate } 4490*0Sstevel@tonic-gate nr = nr->nr_next; 4491*0Sstevel@tonic-gate } 4492*0Sstevel@tonic-gate /* 4493*0Sstevel@tonic-gate * If a node record doesn't exist on its own node, 4494*0Sstevel@tonic-gate * then mark node as DEL. 
4495*0Sstevel@tonic-gate */ 4496*0Sstevel@tonic-gate if (found_my_nr == 0) { 4497*0Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK; 4498*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 4499*0Sstevel@tonic-gate } 4500*0Sstevel@tonic-gate 4501*0Sstevel@tonic-gate /* 4502*0Sstevel@tonic-gate * If node is OK - put mnsr onto master_mnsr_node list for 4503*0Sstevel@tonic-gate * later use when syncing up the drive records in the set. 4504*0Sstevel@tonic-gate */ 4505*0Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OK) { 4506*0Sstevel@tonic-gate mnsr_node = Zalloc(sizeof (*mnsr_node)); 4507*0Sstevel@tonic-gate mnsr_node->mmn_mnsr = mnsr; 4508*0Sstevel@tonic-gate (void) strncpy(mnsr_node->mmn_nodename, 4509*0Sstevel@tonic-gate nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1); 4510*0Sstevel@tonic-gate mnsr_node->mmn_next = master_mnsr_node; 4511*0Sstevel@tonic-gate master_mnsr_node = mnsr_node; 4512*0Sstevel@tonic-gate } else { 4513*0Sstevel@tonic-gate free_sr((struct md_set_record *)mnsr); 4514*0Sstevel@tonic-gate } 4515*0Sstevel@tonic-gate 4516*0Sstevel@tonic-gate nd = nd->nd_next; 4517*0Sstevel@tonic-gate } 4518*0Sstevel@tonic-gate 4519*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4520*0Sstevel@tonic-gate "Master nodelist created for set %s: %s"), 4521*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4522*0Sstevel@tonic-gate 4523*0Sstevel@tonic-gate /* 4524*0Sstevel@tonic-gate * Send master nodelist to the rpc.metad on all nodes (including 4525*0Sstevel@tonic-gate * myself) and each node will update itself. This will set the 4526*0Sstevel@tonic-gate * ADD and DEL flags on each node as setup in the master nodelist. 4527*0Sstevel@tonic-gate * Don't send nodelist to node where set doesn't exist. 
4528*0Sstevel@tonic-gate */ 4529*0Sstevel@tonic-gate nd = master_nodelist; 4530*0Sstevel@tonic-gate while (nd) { 4531*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4532*0Sstevel@tonic-gate (nd->nd_flags & MD_MN_NODE_NOSET)) { 4533*0Sstevel@tonic-gate nd = nd->nd_next; 4534*0Sstevel@tonic-gate continue; 4535*0Sstevel@tonic-gate } 4536*0Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, 4537*0Sstevel@tonic-gate master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 4538*0Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 4539*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 4540*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 4541*0Sstevel@tonic-gate nd->nd_nodeid)) { 4542*0Sstevel@tonic-gate rval = 205; 4543*0Sstevel@tonic-gate } else { 4544*0Sstevel@tonic-gate /* Any other failure */ 4545*0Sstevel@tonic-gate rval = -1; 4546*0Sstevel@tonic-gate } 4547*0Sstevel@tonic-gate goto out; 4548*0Sstevel@tonic-gate } 4549*0Sstevel@tonic-gate nd = nd->nd_next; 4550*0Sstevel@tonic-gate } 4551*0Sstevel@tonic-gate 4552*0Sstevel@tonic-gate /* 4553*0Sstevel@tonic-gate * Now, delete nodes that need to be deleted. 4554*0Sstevel@tonic-gate */ 4555*0Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4556*0Sstevel@tonic-gate ep)) == NULL) { 4557*0Sstevel@tonic-gate if (! mdisok(ep)) { 4558*0Sstevel@tonic-gate rval = -1; 4559*0Sstevel@tonic-gate goto out; 4560*0Sstevel@tonic-gate } 4561*0Sstevel@tonic-gate } 4562*0Sstevel@tonic-gate 4563*0Sstevel@tonic-gate /* 4564*0Sstevel@tonic-gate * May be doing lots of RPC commands to the nodes, so lock the 4565*0Sstevel@tonic-gate * ALIVE members of the set since most of the rpc.metad routines 4566*0Sstevel@tonic-gate * require this for security reasons. 
4567*0Sstevel@tonic-gate */ 4568*0Sstevel@tonic-gate nd = master_nodelist; 4569*0Sstevel@tonic-gate while (nd) { 4570*0Sstevel@tonic-gate /* Skip non-alive nodes and node without set */ 4571*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4572*0Sstevel@tonic-gate (nd->nd_flags & MD_MN_NODE_NOSET)) { 4573*0Sstevel@tonic-gate nd = nd->nd_next; 4574*0Sstevel@tonic-gate continue; 4575*0Sstevel@tonic-gate } 4576*0Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 4577*0Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 4578*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 4579*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 4580*0Sstevel@tonic-gate nd->nd_nodeid)) { 4581*0Sstevel@tonic-gate rval = 205; 4582*0Sstevel@tonic-gate } else { 4583*0Sstevel@tonic-gate /* Any other failure */ 4584*0Sstevel@tonic-gate rval = -1; 4585*0Sstevel@tonic-gate } 4586*0Sstevel@tonic-gate goto out; 4587*0Sstevel@tonic-gate } 4588*0Sstevel@tonic-gate set_locked = 1; 4589*0Sstevel@tonic-gate nd = nd->nd_next; 4590*0Sstevel@tonic-gate } 4591*0Sstevel@tonic-gate 4592*0Sstevel@tonic-gate nd = master_nodelist; 4593*0Sstevel@tonic-gate while (nd) { 4594*0Sstevel@tonic-gate /* Skip non-alive nodes */ 4595*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4596*0Sstevel@tonic-gate nd = nd->nd_next; 4597*0Sstevel@tonic-gate continue; 4598*0Sstevel@tonic-gate } 4599*0Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_DEL) { 4600*0Sstevel@tonic-gate num_alive_nodes_del++; 4601*0Sstevel@tonic-gate /* 4602*0Sstevel@tonic-gate * Delete this node rec from all ALIVE nodes in diskset. 
4603*0Sstevel@tonic-gate */ 4604*0Sstevel@tonic-gate nd2 = master_nodelist; 4605*0Sstevel@tonic-gate while (nd2) { 4606*0Sstevel@tonic-gate /* Skip non-alive nodes and node without set */ 4607*0Sstevel@tonic-gate if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) || 4608*0Sstevel@tonic-gate (nd2->nd_flags & MD_MN_NODE_NOSET)) { 4609*0Sstevel@tonic-gate nd2 = nd2->nd_next; 4610*0Sstevel@tonic-gate continue; 4611*0Sstevel@tonic-gate } 4612*0Sstevel@tonic-gate 4613*0Sstevel@tonic-gate /* This is a node being deleted from set */ 4614*0Sstevel@tonic-gate if (nd2->nd_nodeid == nd->nd_nodeid) { 4615*0Sstevel@tonic-gate /* Mark set record as DEL */ 4616*0Sstevel@tonic-gate if (clnt_upd_sr_flags(nd->nd_nodename, 4617*0Sstevel@tonic-gate sp, MD_SR_DEL, ep)) { 4618*0Sstevel@tonic-gate /* RPC failure to !my node */ 4619*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 4620*0Sstevel@tonic-gate (sd->sd_mn_mynode-> 4621*0Sstevel@tonic-gate nd_nodeid 4622*0Sstevel@tonic-gate != nd->nd_nodeid)) { 4623*0Sstevel@tonic-gate rval = 205; 4624*0Sstevel@tonic-gate } else { 4625*0Sstevel@tonic-gate /* Any other failure */ 4626*0Sstevel@tonic-gate rval = -1; 4627*0Sstevel@tonic-gate } 4628*0Sstevel@tonic-gate goto out; 4629*0Sstevel@tonic-gate } 4630*0Sstevel@tonic-gate if (clnt_deldrvs(nd->nd_nodename, sp, 4631*0Sstevel@tonic-gate dd, ep)) { 4632*0Sstevel@tonic-gate /* RPC failure to !my node */ 4633*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 4634*0Sstevel@tonic-gate (sd->sd_mn_mynode-> 4635*0Sstevel@tonic-gate nd_nodeid 4636*0Sstevel@tonic-gate != nd->nd_nodeid)) { 4637*0Sstevel@tonic-gate rval = 205; 4638*0Sstevel@tonic-gate } else { 4639*0Sstevel@tonic-gate /* Any other failure */ 4640*0Sstevel@tonic-gate rval = -1; 4641*0Sstevel@tonic-gate } 4642*0Sstevel@tonic-gate goto out; 4643*0Sstevel@tonic-gate } 4644*0Sstevel@tonic-gate if (clnt_delset(nd->nd_nodename, sp, 4645*0Sstevel@tonic-gate ep) == -1) { 4646*0Sstevel@tonic-gate /* RPC failure to !my node */ 4647*0Sstevel@tonic-gate if 
((mdanyrpcerror(ep)) && 4648*0Sstevel@tonic-gate (sd->sd_mn_mynode-> 4649*0Sstevel@tonic-gate nd_nodeid 4650*0Sstevel@tonic-gate != nd->nd_nodeid)) { 4651*0Sstevel@tonic-gate rval = 205; 4652*0Sstevel@tonic-gate } else { 4653*0Sstevel@tonic-gate /* Any other failure */ 4654*0Sstevel@tonic-gate rval = -1; 4655*0Sstevel@tonic-gate } 4656*0Sstevel@tonic-gate goto out; 4657*0Sstevel@tonic-gate } 4658*0Sstevel@tonic-gate } else { 4659*0Sstevel@tonic-gate /* 4660*0Sstevel@tonic-gate * Delete host from sets on hosts 4661*0Sstevel@tonic-gate * not being deleted. 4662*0Sstevel@tonic-gate */ 4663*0Sstevel@tonic-gate anode[0] = Strdup(nd->nd_nodename); 4664*0Sstevel@tonic-gate if (clnt_delhosts(nd2->nd_nodename, sp, 4665*0Sstevel@tonic-gate 1, anode, ep) == -1) { 4666*0Sstevel@tonic-gate Free(anode[0]); 4667*0Sstevel@tonic-gate /* RPC failure to !my node */ 4668*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 4669*0Sstevel@tonic-gate (sd->sd_mn_mynode-> 4670*0Sstevel@tonic-gate nd_nodeid 4671*0Sstevel@tonic-gate != nd2->nd_nodeid)) { 4672*0Sstevel@tonic-gate rval = 205; 4673*0Sstevel@tonic-gate } else { 4674*0Sstevel@tonic-gate /* Any other failure */ 4675*0Sstevel@tonic-gate rval = -1; 4676*0Sstevel@tonic-gate } 4677*0Sstevel@tonic-gate goto out; 4678*0Sstevel@tonic-gate } 4679*0Sstevel@tonic-gate 4680*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, 4681*0Sstevel@tonic-gate dgettext(TEXT_DOMAIN, 4682*0Sstevel@tonic-gate "Deleted node %s (%d) on node %s " 4683*0Sstevel@tonic-gate "from set %s: %s"), 4684*0Sstevel@tonic-gate nd->nd_nodename, nd->nd_nodeid, 4685*0Sstevel@tonic-gate nd2->nd_nodename, 4686*0Sstevel@tonic-gate sp->setname, 4687*0Sstevel@tonic-gate meta_print_hrtime( 4688*0Sstevel@tonic-gate gethrtime() - start_time)); 4689*0Sstevel@tonic-gate 4690*0Sstevel@tonic-gate Free(anode[0]); 4691*0Sstevel@tonic-gate } 4692*0Sstevel@tonic-gate nd2 = nd2->nd_next; 4693*0Sstevel@tonic-gate } 4694*0Sstevel@tonic-gate } 4695*0Sstevel@tonic-gate nd = nd->nd_next; 
4696*0Sstevel@tonic-gate } 4697*0Sstevel@tonic-gate 4698*0Sstevel@tonic-gate nd = master_nodelist; 4699*0Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 4700*0Sstevel@tonic-gate while (nd) { 4701*0Sstevel@tonic-gate /* Skip non-alive nodes and node without set */ 4702*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4703*0Sstevel@tonic-gate (nd->nd_flags & MD_MN_NODE_NOSET)) { 4704*0Sstevel@tonic-gate nd = nd->nd_next; 4705*0Sstevel@tonic-gate continue; 4706*0Sstevel@tonic-gate } 4707*0Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 4708*0Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 4709*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 4710*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 4711*0Sstevel@tonic-gate nd->nd_nodeid)) { 4712*0Sstevel@tonic-gate rval = 205; 4713*0Sstevel@tonic-gate } else { 4714*0Sstevel@tonic-gate /* Any other failure */ 4715*0Sstevel@tonic-gate rval = -1; 4716*0Sstevel@tonic-gate } 4717*0Sstevel@tonic-gate goto out; 4718*0Sstevel@tonic-gate } 4719*0Sstevel@tonic-gate nd = nd->nd_next; 4720*0Sstevel@tonic-gate } 4721*0Sstevel@tonic-gate cl_set_setkey(NULL); 4722*0Sstevel@tonic-gate set_locked = 0; 4723*0Sstevel@tonic-gate 4724*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4725*0Sstevel@tonic-gate "Nodelist syncronization complete for set %s: %s"), 4726*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4727*0Sstevel@tonic-gate 4728*0Sstevel@tonic-gate metaflushsetname(sp); 4729*0Sstevel@tonic-gate 4730*0Sstevel@tonic-gate /* 4731*0Sstevel@tonic-gate * If all alive nodes have been deleted from set, just 4732*0Sstevel@tonic-gate * return since nothing else can be done until non-alive 4733*0Sstevel@tonic-gate * nodes (if there are any) rejoin the cluster. 
4734*0Sstevel@tonic-gate */ 4735*0Sstevel@tonic-gate if (num_alive_nodes == num_alive_nodes_del) { 4736*0Sstevel@tonic-gate rval = 0; 4737*0Sstevel@tonic-gate goto out; 4738*0Sstevel@tonic-gate } 4739*0Sstevel@tonic-gate 4740*0Sstevel@tonic-gate /* 4741*0Sstevel@tonic-gate * Sync up drive records. 4742*0Sstevel@tonic-gate * 4743*0Sstevel@tonic-gate * If a node panic'd (or metaset command was killed) during the 4744*0Sstevel@tonic-gate * addition or deletion of a drive to the diskset, the nodes 4745*0Sstevel@tonic-gate * may have a different view of the drive list. During cleanup 4746*0Sstevel@tonic-gate * of the drive list during reconfig, a drive will be deleted 4747*0Sstevel@tonic-gate * from the list if the master node sees that the drive has been 4748*0Sstevel@tonic-gate * marked in the ADD state on any node or is marked in the DEL state 4749*0Sstevel@tonic-gate * on all nodes. 4750*0Sstevel@tonic-gate * This cleanup must occur even if all nodes in the cluster are 4751*0Sstevel@tonic-gate * not part of the cluster so that all nodes have the same view 4752*0Sstevel@tonic-gate * of the drivelist. 4753*0Sstevel@tonic-gate * Then if the entire cluster goes down and comes back up, the 4754*0Sstevel@tonic-gate * new master node could be a node that wasn't in the cluster when 4755*0Sstevel@tonic-gate * the node was deleted. This could lead to a situation where the 4756*0Sstevel@tonic-gate * master node thinks that a drive is OK, but this drive isn't 4757*0Sstevel@tonic-gate * known to the other nodes. 4758*0Sstevel@tonic-gate * This situation can also occur during the addition of a drive 4759*0Sstevel@tonic-gate * where a node has the drive marked OK, but the node executing the 4760*0Sstevel@tonic-gate * metaset command enountered a failure before marking that drive OK 4761*0Sstevel@tonic-gate * on the rest of the nodes. 
If the node with the OK drive then 4762*0Sstevel@tonic-gate * panics, then the rest of the nodes will remove that drive marked ADD 4763*0Sstevel@tonic-gate * and when the node with the OK drive rejoins the cluster, it will 4764*0Sstevel@tonic-gate * have a drive marked OK that is unknown by the other nodes. 4765*0Sstevel@tonic-gate * 4766*0Sstevel@tonic-gate * There are 2 situations to consider: 4767*0Sstevel@tonic-gate * A) Master knows about a drive that other nodes don't know about. 4768*0Sstevel@tonic-gate * B) At least one slave node knows about a drive that the master 4769*0Sstevel@tonic-gate * node doesn't know about. 4770*0Sstevel@tonic-gate * 4771*0Sstevel@tonic-gate * To handle these situations the following steps are followed: 4772*0Sstevel@tonic-gate * 1) Count number of drives known by this master node and the 4773*0Sstevel@tonic-gate * other slave nodes. 4774*0Sstevel@tonic-gate * If all nodes have the same number of drives and the master has 4775*0Sstevel@tonic-gate * all drives marked OK, then skip to step 4. 4776*0Sstevel@tonic-gate * 4777*0Sstevel@tonic-gate * 2) If a node has fewer drives listed than the master, the master 4778*0Sstevel@tonic-gate * must get the drive descriptor list from that node so that 4779*0Sstevel@tonic-gate * master can determine which drive it needs to delete from that 4780*0Sstevel@tonic-gate * node. Master must get the drive descriptor list since the 4781*0Sstevel@tonic-gate * drive record list does not contain the name of the drive, but 4782*0Sstevel@tonic-gate * only a key and the key can only be interpreted on that other 4783*0Sstevel@tonic-gate * node. 4784*0Sstevel@tonic-gate * 4785*0Sstevel@tonic-gate * 3) The master will then create the master drive list by doing: 4786*0Sstevel@tonic-gate * - Master starts with drive list known by master. 4787*0Sstevel@tonic-gate * - Any drive marked ADD will be removed from the list.
4788*0Sstevel@tonic-gate * - Any drive not known by another node (from step2) will be 4789*0Sstevel@tonic-gate * removed from the drive list. 4790*0Sstevel@tonic-gate * - If a drive is marked DEL on the master, the master must 4791*0Sstevel@tonic-gate * verify that the drive record is marked DEL on all nodes. 4792*0Sstevel@tonic-gate * If any node has the drive record marked OK, mark it OK 4793*0Sstevel@tonic-gate * on the master. (The reason why is described below). 4794*0Sstevel@tonic-gate * 4795*0Sstevel@tonic-gate * 4) The master sends out the master drive list and the slave 4796*0Sstevel@tonic-gate * nodes will force their drive lists to match the master 4797*0Sstevel@tonic-gate * drive list by deleting drives, if necessary and by changing 4798*0Sstevel@tonic-gate * the drive record states from ADD->OK if master has drive 4799*0Sstevel@tonic-gate * marked OK and slave has drive marked ADD. 4800*0Sstevel@tonic-gate * 4801*0Sstevel@tonic-gate * Interesting scenarios: 4802*0Sstevel@tonic-gate * 4803*0Sstevel@tonic-gate * 1) System has 4 nodes with node 1 as the master. Node 3 starts 4804*0Sstevel@tonic-gate * to delete a drive record (drive record on node 1 is marked DEL), 4805*0Sstevel@tonic-gate * but is stopped when node 3 panics. Node 1 also panics. 4806*0Sstevel@tonic-gate * During reconfig cycle, node 2 is picked as master and the drive 4807*0Sstevel@tonic-gate * record is left alone since all nodes in the cluster have it 4808*0Sstevel@tonic-gate * marked OK. User now sees drive as part of diskset. 4809*0Sstevel@tonic-gate * Now, entire cluster is rebooted and node 1 rejoins the cluster. 4810*0Sstevel@tonic-gate * Node 1 is picked as the master and node 1 has drive record 4811*0Sstevel@tonic-gate * marked DEL. Node 1 contacts all other nodes in the cluster 4812*0Sstevel@tonic-gate * and since at least one node has the drive record marked OK, 4813*0Sstevel@tonic-gate * the master marks the drive record OK. 
4814*0Sstevel@tonic-gate * User continues to see the drive as part of the diskset. 4815*0Sstevel@tonic-gate */ 4816*0Sstevel@tonic-gate 4817*0Sstevel@tonic-gate /* Reget set descriptor since flushed above */ 4818*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4819*0Sstevel@tonic-gate rval = -1; 4820*0Sstevel@tonic-gate goto out; 4821*0Sstevel@tonic-gate } 4822*0Sstevel@tonic-gate 4823*0Sstevel@tonic-gate /* Has side effect of setting sd->sd_drvs to same as master_dd */ 4824*0Sstevel@tonic-gate if ((master_dd = metaget_drivedesc_sideno(sp, 4825*0Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodeid, 4826*0Sstevel@tonic-gate (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) { 4827*0Sstevel@tonic-gate /* No drives in list */ 4828*0Sstevel@tonic-gate if (!mdisok(ep)) { 4829*0Sstevel@tonic-gate /* 4830*0Sstevel@tonic-gate * Can't get drive list for this node, so 4831*0Sstevel@tonic-gate * return -1 causing this node to be removed 4832*0Sstevel@tonic-gate * cluster config and fixed. 4833*0Sstevel@tonic-gate */ 4834*0Sstevel@tonic-gate rval = -1; 4835*0Sstevel@tonic-gate goto out; 4836*0Sstevel@tonic-gate } 4837*0Sstevel@tonic-gate } 4838*0Sstevel@tonic-gate 4839*0Sstevel@tonic-gate /* Count the number of drives for all nodes */ 4840*0Sstevel@tonic-gate mnsr_node = master_mnsr_node; 4841*0Sstevel@tonic-gate while (mnsr_node) { 4842*0Sstevel@tonic-gate dr_cnt = 0; 4843*0Sstevel@tonic-gate dr = mnsr_node->mmn_mnsr->sr_drivechain; 4844*0Sstevel@tonic-gate while (dr) { 4845*0Sstevel@tonic-gate dr_cnt++; 4846*0Sstevel@tonic-gate dr = dr->dr_next; 4847*0Sstevel@tonic-gate } 4848*0Sstevel@tonic-gate mnsr_node->mmn_numdrives = dr_cnt; 4849*0Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next; 4850*0Sstevel@tonic-gate } 4851*0Sstevel@tonic-gate 4852*0Sstevel@tonic-gate /* Count the number of drives for the master; also check flags */ 4853*0Sstevel@tonic-gate all_drives_ok = 1; 4854*0Sstevel@tonic-gate dd_cnt = 0; 4855*0Sstevel@tonic-gate dd = master_dd; 
4856*0Sstevel@tonic-gate while (dd) { 4857*0Sstevel@tonic-gate dd_cnt++; 4858*0Sstevel@tonic-gate if (!(dd->dd_flags & MD_DR_OK)) 4859*0Sstevel@tonic-gate all_drives_ok = 0; 4860*0Sstevel@tonic-gate dd = dd->dd_next; 4861*0Sstevel@tonic-gate } 4862*0Sstevel@tonic-gate 4863*0Sstevel@tonic-gate /* If all drives are ok, do quick check against number of drives */ 4864*0Sstevel@tonic-gate if (all_drives_ok) { 4865*0Sstevel@tonic-gate /* If all nodes have same number of drives, almost done */ 4866*0Sstevel@tonic-gate mnsr_node = master_mnsr_node; 4867*0Sstevel@tonic-gate while (mnsr_node) { 4868*0Sstevel@tonic-gate if (mnsr_node->mmn_numdrives != dd_cnt) 4869*0Sstevel@tonic-gate break; 4870*0Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next; 4871*0Sstevel@tonic-gate } 4872*0Sstevel@tonic-gate /* All nodes have same number of drives, just send flags */ 4873*0Sstevel@tonic-gate if (mnsr_node == NULL) { 4874*0Sstevel@tonic-gate goto send_drive_list; 4875*0Sstevel@tonic-gate } 4876*0Sstevel@tonic-gate } 4877*0Sstevel@tonic-gate 4878*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4879*0Sstevel@tonic-gate "Begin detailed drive synchronization for set %s: %s"), 4880*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4881*0Sstevel@tonic-gate 4882*0Sstevel@tonic-gate /* Detailed check required */ 4883*0Sstevel@tonic-gate mnsr_node = master_mnsr_node; 4884*0Sstevel@tonic-gate while (mnsr_node) { 4885*0Sstevel@tonic-gate /* Does slave node have less drives than master? 
*/ 4886*0Sstevel@tonic-gate if (mnsr_node->mmn_numdrives < dd_cnt) { 4887*0Sstevel@tonic-gate /* Yes - must determine which drive is missing */ 4888*0Sstevel@tonic-gate if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp, 4889*0Sstevel@tonic-gate &other_dd, ep)) { 4890*0Sstevel@tonic-gate /* RPC failure to !my node */ 4891*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 4892*0Sstevel@tonic-gate (strcmp(mynode(), mnsr_node->mmn_nodename) 4893*0Sstevel@tonic-gate != 0)) { 4894*0Sstevel@tonic-gate rval = 205; 4895*0Sstevel@tonic-gate } else { 4896*0Sstevel@tonic-gate /* Any other failure */ 4897*0Sstevel@tonic-gate rval = -1; 4898*0Sstevel@tonic-gate } 4899*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 4900*0Sstevel@tonic-gate "Master node %s unable to " 4901*0Sstevel@tonic-gate "retrieve drive list from node %s"), 4902*0Sstevel@tonic-gate mynode(), mnsr_node->mmn_nodename); 4903*0Sstevel@tonic-gate goto out; 4904*0Sstevel@tonic-gate } 4905*0Sstevel@tonic-gate mnsr_node->mmn_dd = other_dd; 4906*0Sstevel@tonic-gate dd = master_dd; 4907*0Sstevel@tonic-gate while (dd) { 4908*0Sstevel@tonic-gate if (!(dd->dd_flags & MD_DR_OK)) { 4909*0Sstevel@tonic-gate dd = dd->dd_next; 4910*0Sstevel@tonic-gate continue; 4911*0Sstevel@tonic-gate } 4912*0Sstevel@tonic-gate other_dd = mnsr_node->mmn_dd; 4913*0Sstevel@tonic-gate while (other_dd) { 4914*0Sstevel@tonic-gate /* Convert to devids, when available */ 4915*0Sstevel@tonic-gate if (strcmp(other_dd->dd_dnp->cname, 4916*0Sstevel@tonic-gate dd->dd_dnp->cname) == 0) { 4917*0Sstevel@tonic-gate break; 4918*0Sstevel@tonic-gate } 4919*0Sstevel@tonic-gate other_dd = other_dd->dd_next; 4920*0Sstevel@tonic-gate } 4921*0Sstevel@tonic-gate /* 4922*0Sstevel@tonic-gate * dd not found on slave so mark it 4923*0Sstevel@tonic-gate * ADD for later deletion (drives in ADD 4924*0Sstevel@tonic-gate * state are deleted later in this routine). 
4925*0Sstevel@tonic-gate */ 4926*0Sstevel@tonic-gate if (other_dd == NULL) { 4927*0Sstevel@tonic-gate dd->dd_flags = MD_DR_ADD; 4928*0Sstevel@tonic-gate } 4929*0Sstevel@tonic-gate dd = dd->dd_next; 4930*0Sstevel@tonic-gate } 4931*0Sstevel@tonic-gate 4932*0Sstevel@tonic-gate } 4933*0Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next; 4934*0Sstevel@tonic-gate } 4935*0Sstevel@tonic-gate 4936*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4937*0Sstevel@tonic-gate "Drive check completed for set %s: %s"), 4938*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4939*0Sstevel@tonic-gate 4940*0Sstevel@tonic-gate dd = master_dd; 4941*0Sstevel@tonic-gate dd_prev = 0; 4942*0Sstevel@tonic-gate while (dd) { 4943*0Sstevel@tonic-gate /* Remove any ADD drives from list */ 4944*0Sstevel@tonic-gate if (dd->dd_flags & MD_DR_ADD) { 4945*0Sstevel@tonic-gate if (dd_prev) { 4946*0Sstevel@tonic-gate dd_prev->dd_next = dd->dd_next; 4947*0Sstevel@tonic-gate dd->dd_next = NULL; 4948*0Sstevel@tonic-gate metafreedrivedesc(&dd); 4949*0Sstevel@tonic-gate dd = dd_prev->dd_next; 4950*0Sstevel@tonic-gate } else { 4951*0Sstevel@tonic-gate /* 4952*0Sstevel@tonic-gate * If removing drive descriptor from head 4953*0Sstevel@tonic-gate * of linked list, also change sd->sd_drvs. 4954*0Sstevel@tonic-gate */ 4955*0Sstevel@tonic-gate master_dd = sd->sd_drvs = dd->dd_next; 4956*0Sstevel@tonic-gate dd->dd_next = NULL; 4957*0Sstevel@tonic-gate metafreedrivedesc(&dd); 4958*0Sstevel@tonic-gate dd = master_dd; 4959*0Sstevel@tonic-gate } 4960*0Sstevel@tonic-gate /* dd setup in if/else above */ 4961*0Sstevel@tonic-gate continue; 4962*0Sstevel@tonic-gate } 4963*0Sstevel@tonic-gate /* 4964*0Sstevel@tonic-gate * If drive is marked DEL, check all other nodes. 4965*0Sstevel@tonic-gate * If drive on another node is marked OK, mark drive OK 4966*0Sstevel@tonic-gate * in master list. 
If drive is marked DEL or doesn't exist 4967*0Sstevel@tonic-gate * on all nodes, remove drive from list. 4968*0Sstevel@tonic-gate */ 4969*0Sstevel@tonic-gate if (dd->dd_flags & MD_DR_DEL) { 4970*0Sstevel@tonic-gate mnsr_node = master_mnsr_node; 4971*0Sstevel@tonic-gate while (mnsr_node) { 4972*0Sstevel@tonic-gate if (mnsr_node->mmn_dd == NULL) { 4973*0Sstevel@tonic-gate if (clnt_getdrivedesc( 4974*0Sstevel@tonic-gate mnsr_node->mmn_nodename, sp, 4975*0Sstevel@tonic-gate &other_dd, ep)) { 4976*0Sstevel@tonic-gate /* RPC failure to !my node */ 4977*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 4978*0Sstevel@tonic-gate (strcmp(mynode(), 4979*0Sstevel@tonic-gate mnsr_node->mmn_nodename) 4980*0Sstevel@tonic-gate != 0)) { 4981*0Sstevel@tonic-gate rval = 205; 4982*0Sstevel@tonic-gate } else { 4983*0Sstevel@tonic-gate /* Any other failure */ 4984*0Sstevel@tonic-gate rval = -1; 4985*0Sstevel@tonic-gate } 4986*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 4987*0Sstevel@tonic-gate "Master node %s unable " 4988*0Sstevel@tonic-gate "to retrieve drive list from " 4989*0Sstevel@tonic-gate "node %s"), mynode(), 4990*0Sstevel@tonic-gate mnsr_node->mmn_nodename); 4991*0Sstevel@tonic-gate goto out; 4992*0Sstevel@tonic-gate } 4993*0Sstevel@tonic-gate mnsr_node->mmn_dd = other_dd; 4994*0Sstevel@tonic-gate } 4995*0Sstevel@tonic-gate other_dd = mnsr_node->mmn_dd; 4996*0Sstevel@tonic-gate while (other_dd) { 4997*0Sstevel@tonic-gate /* Found drive (OK) from other node */ 4998*0Sstevel@tonic-gate if (strcmp(dd->dd_dnp->cname, 4999*0Sstevel@tonic-gate other_dd->dd_dnp->cname) 5000*0Sstevel@tonic-gate == 0) { 5001*0Sstevel@tonic-gate /* Drive marked OK */ 5002*0Sstevel@tonic-gate if (other_dd->dd_flags & 5003*0Sstevel@tonic-gate MD_DR_OK) { 5004*0Sstevel@tonic-gate dd->dd_flags = MD_DR_OK; 5005*0Sstevel@tonic-gate } 5006*0Sstevel@tonic-gate break; 5007*0Sstevel@tonic-gate } 5008*0Sstevel@tonic-gate other_dd = other_dd->dd_next; 5009*0Sstevel@tonic-gate } 5010*0Sstevel@tonic-gate 
if (dd->dd_flags == MD_DR_OK) 5011*0Sstevel@tonic-gate break; 5012*0Sstevel@tonic-gate 5013*0Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next; 5014*0Sstevel@tonic-gate } 5015*0Sstevel@tonic-gate /* 5016*0Sstevel@tonic-gate * If no node had this drive marked OK, delete it. 5017*0Sstevel@tonic-gate */ 5018*0Sstevel@tonic-gate if (dd->dd_flags & MD_DR_DEL) { 5019*0Sstevel@tonic-gate if (dd_prev) { 5020*0Sstevel@tonic-gate dd_prev->dd_next = dd->dd_next; 5021*0Sstevel@tonic-gate dd->dd_next = NULL; 5022*0Sstevel@tonic-gate metafreedrivedesc(&dd); 5023*0Sstevel@tonic-gate dd = dd_prev->dd_next; 5024*0Sstevel@tonic-gate } else { 5025*0Sstevel@tonic-gate /* 5026*0Sstevel@tonic-gate * If removing drive descriptor from 5027*0Sstevel@tonic-gate * head of linked list, also change 5028*0Sstevel@tonic-gate * sd->sd_drvs. 5029*0Sstevel@tonic-gate */ 5030*0Sstevel@tonic-gate master_dd = sd->sd_drvs = dd->dd_next; 5031*0Sstevel@tonic-gate dd->dd_next = NULL; 5032*0Sstevel@tonic-gate metafreedrivedesc(&dd); 5033*0Sstevel@tonic-gate dd = master_dd; 5034*0Sstevel@tonic-gate } 5035*0Sstevel@tonic-gate /* dd setup in if/else above */ 5036*0Sstevel@tonic-gate continue; 5037*0Sstevel@tonic-gate } 5038*0Sstevel@tonic-gate } 5039*0Sstevel@tonic-gate dd_prev = dd; 5040*0Sstevel@tonic-gate dd = dd->dd_next; 5041*0Sstevel@tonic-gate } 5042*0Sstevel@tonic-gate 5043*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5044*0Sstevel@tonic-gate "Setting drive states completed for set %s: %s"), 5045*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5046*0Sstevel@tonic-gate 5047*0Sstevel@tonic-gate send_drive_list: 5048*0Sstevel@tonic-gate /* 5049*0Sstevel@tonic-gate * Set genid on all drives to be the highest value seen. 
5050*0Sstevel@tonic-gate */ 5051*0Sstevel@tonic-gate dd = master_dd; 5052*0Sstevel@tonic-gate while (dd) { 5053*0Sstevel@tonic-gate dd->dd_genid = max_genid; 5054*0Sstevel@tonic-gate dd = dd->dd_next; 5055*0Sstevel@tonic-gate } 5056*0Sstevel@tonic-gate /* 5057*0Sstevel@tonic-gate * Send updated drive list to all alive nodes. 5058*0Sstevel@tonic-gate * Will also set genid on set and node records to have same 5059*0Sstevel@tonic-gate * as the drive records. 5060*0Sstevel@tonic-gate */ 5061*0Sstevel@tonic-gate nd = sd->sd_nodelist; 5062*0Sstevel@tonic-gate while (nd) { 5063*0Sstevel@tonic-gate /* Skip non-alive nodes */ 5064*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5065*0Sstevel@tonic-gate nd = nd->nd_next; 5066*0Sstevel@tonic-gate continue; 5067*0Sstevel@tonic-gate } 5068*0Sstevel@tonic-gate if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) { 5069*0Sstevel@tonic-gate /* RPC failure to another node */ 5070*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 5071*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) { 5072*0Sstevel@tonic-gate rval = 205; 5073*0Sstevel@tonic-gate } else { 5074*0Sstevel@tonic-gate /* Any other failure */ 5075*0Sstevel@tonic-gate rval = -1; 5076*0Sstevel@tonic-gate } 5077*0Sstevel@tonic-gate goto out; 5078*0Sstevel@tonic-gate } 5079*0Sstevel@tonic-gate nd = nd->nd_next; 5080*0Sstevel@tonic-gate } 5081*0Sstevel@tonic-gate 5082*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5083*0Sstevel@tonic-gate "Sent drive list to all nodes for set %s: %s"), 5084*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5085*0Sstevel@tonic-gate 5086*0Sstevel@tonic-gate /* 5087*0Sstevel@tonic-gate * If no drive records left in set and nodes had been joined, 5088*0Sstevel@tonic-gate * withdraw the nodes. Always reset the master and mark 5089*0Sstevel@tonic-gate * all nodes as withdrawn on all nodes. 
5090*0Sstevel@tonic-gate */ 5091*0Sstevel@tonic-gate if (master_dd == NULL) { 5092*0Sstevel@tonic-gate /* Reset new master flag since no longer master */ 5093*0Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 5094*0Sstevel@tonic-gate sf.sf_setno = sp->setno; 5095*0Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5096*0Sstevel@tonic-gate sf.sf_flags = MDDB_NM_RESET; 5097*0Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */ 5098*0Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5099*0Sstevel@tonic-gate /* Ignore failure, failure to reset flag isn't catastrophic */ 5100*0Sstevel@tonic-gate (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5101*0Sstevel@tonic-gate &sf.sf_mde, NULL); 5102*0Sstevel@tonic-gate 5103*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5104*0Sstevel@tonic-gate "Reset new master flag for " "set %s: %s"), 5105*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5106*0Sstevel@tonic-gate 5107*0Sstevel@tonic-gate nd = sd->sd_nodelist; 5108*0Sstevel@tonic-gate while (nd) { 5109*0Sstevel@tonic-gate /* Skip non-alive nodes */ 5110*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5111*0Sstevel@tonic-gate nd = nd->nd_next; 5112*0Sstevel@tonic-gate continue; 5113*0Sstevel@tonic-gate } 5114*0Sstevel@tonic-gate 5115*0Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 5116*0Sstevel@tonic-gate /* RPC failure to another node */ 5117*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 5118*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 5119*0Sstevel@tonic-gate nd->nd_nodeid)) { 5120*0Sstevel@tonic-gate rval = 205; 5121*0Sstevel@tonic-gate } else { 5122*0Sstevel@tonic-gate /* Any other failure */ 5123*0Sstevel@tonic-gate rval = -1; 5124*0Sstevel@tonic-gate } 5125*0Sstevel@tonic-gate goto out; 5126*0Sstevel@tonic-gate } 5127*0Sstevel@tonic-gate set_locked = 1; 5128*0Sstevel@tonic-gate 5129*0Sstevel@tonic-gate /* Withdraw node from set if owner */ 
5130*0Sstevel@tonic-gate if ((nd->nd_flags & MD_MN_NODE_OWN) && 5131*0Sstevel@tonic-gate (clnt_withdrawset(nd->nd_nodename, sp, ep))) { 5132*0Sstevel@tonic-gate /* RPC failure to another node */ 5133*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 5134*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 5135*0Sstevel@tonic-gate nd->nd_nodeid)) { 5136*0Sstevel@tonic-gate rval = 205; 5137*0Sstevel@tonic-gate } else { 5138*0Sstevel@tonic-gate /* Any other failure */ 5139*0Sstevel@tonic-gate rval = -1; 5140*0Sstevel@tonic-gate } 5141*0Sstevel@tonic-gate goto out; 5142*0Sstevel@tonic-gate } 5143*0Sstevel@tonic-gate 5144*0Sstevel@tonic-gate /* Mark all nodes as withdrawn on this node */ 5145*0Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, 5146*0Sstevel@tonic-gate sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) { 5147*0Sstevel@tonic-gate /* RPC failure to another node */ 5148*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 5149*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 5150*0Sstevel@tonic-gate nd->nd_nodeid)) { 5151*0Sstevel@tonic-gate rval = 205; 5152*0Sstevel@tonic-gate } else { 5153*0Sstevel@tonic-gate /* Any other failure */ 5154*0Sstevel@tonic-gate rval = -1; 5155*0Sstevel@tonic-gate } 5156*0Sstevel@tonic-gate goto out; 5157*0Sstevel@tonic-gate } 5158*0Sstevel@tonic-gate 5159*0Sstevel@tonic-gate /* Resets master to no-master on this node */ 5160*0Sstevel@tonic-gate if (clnt_mnsetmaster(nd->nd_nodename, sp, 5161*0Sstevel@tonic-gate "", MD_MN_INVALID_NID, ep)) { 5162*0Sstevel@tonic-gate /* RPC failure to another node */ 5163*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 5164*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 5165*0Sstevel@tonic-gate nd->nd_nodeid)) { 5166*0Sstevel@tonic-gate rval = 205; 5167*0Sstevel@tonic-gate } else { 5168*0Sstevel@tonic-gate /* Any other failure */ 5169*0Sstevel@tonic-gate rval = -1; 5170*0Sstevel@tonic-gate } 5171*0Sstevel@tonic-gate goto out; 5172*0Sstevel@tonic-gate } 5173*0Sstevel@tonic-gate 
5174*0Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 5175*0Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 5176*0Sstevel@tonic-gate /* RPC failure to another node */ 5177*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 5178*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 5179*0Sstevel@tonic-gate nd->nd_nodeid)) { 5180*0Sstevel@tonic-gate rval = 205; 5181*0Sstevel@tonic-gate } else { 5182*0Sstevel@tonic-gate /* Any other failure */ 5183*0Sstevel@tonic-gate rval = -1; 5184*0Sstevel@tonic-gate } 5185*0Sstevel@tonic-gate goto out; 5186*0Sstevel@tonic-gate } 5187*0Sstevel@tonic-gate set_locked = 0; 5188*0Sstevel@tonic-gate nd = nd->nd_next; 5189*0Sstevel@tonic-gate } 5190*0Sstevel@tonic-gate } 5191*0Sstevel@tonic-gate 5192*0Sstevel@tonic-gate out: 5193*0Sstevel@tonic-gate /* 5194*0Sstevel@tonic-gate * If got here and set is still locked, then an error has 5195*0Sstevel@tonic-gate * occurred and master_nodelist is still valid. 5196*0Sstevel@tonic-gate * If error is not an RPC error, then unlock. 5197*0Sstevel@tonic-gate * If error is an RPC error, skip unlocks since this could cause 5198*0Sstevel@tonic-gate * yet another RPC timeout if a node has failed. 5199*0Sstevel@tonic-gate * Ignore failures in unlock since unlock is just trying to 5200*0Sstevel@tonic-gate * clean things up. 5201*0Sstevel@tonic-gate */ 5202*0Sstevel@tonic-gate if ((set_locked) && !(mdanyrpcerror(ep))) { 5203*0Sstevel@tonic-gate nd = master_nodelist; 5204*0Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 5205*0Sstevel@tonic-gate while (nd) { 5206*0Sstevel@tonic-gate /* Skip non-alive nodes */ 5207*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5208*0Sstevel@tonic-gate nd = nd->nd_next; 5209*0Sstevel@tonic-gate continue; 5210*0Sstevel@tonic-gate } 5211*0Sstevel@tonic-gate /* 5212*0Sstevel@tonic-gate * If clnt_unlock fails, just break out since next 5213*0Sstevel@tonic-gate * reconfig cycle will reset the locks anyway. 
5214*0Sstevel@tonic-gate */ 5215*0Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 5216*0Sstevel@tonic-gate break; 5217*0Sstevel@tonic-gate } 5218*0Sstevel@tonic-gate nd = nd->nd_next; 5219*0Sstevel@tonic-gate } 5220*0Sstevel@tonic-gate cl_set_setkey(NULL); 5221*0Sstevel@tonic-gate } 5222*0Sstevel@tonic-gate /* Free master_mnsr and drive descs */ 5223*0Sstevel@tonic-gate mnsr_node = master_mnsr_node; 5224*0Sstevel@tonic-gate while (mnsr_node) { 5225*0Sstevel@tonic-gate master_mnsr_node = mnsr_node->mmn_next; 5226*0Sstevel@tonic-gate free_sr((md_set_record *)mnsr_node->mmn_mnsr); 5227*0Sstevel@tonic-gate free_rem_dd(mnsr_node->mmn_dd); 5228*0Sstevel@tonic-gate Free(mnsr_node); 5229*0Sstevel@tonic-gate mnsr_node = master_mnsr_node; 5230*0Sstevel@tonic-gate } 5231*0Sstevel@tonic-gate 5232*0Sstevel@tonic-gate /* Frees sd->sd_drvs (which is also master_dd) */ 5233*0Sstevel@tonic-gate metaflushsetname(sp); 5234*0Sstevel@tonic-gate return (rval); 5235*0Sstevel@tonic-gate } 5236*0Sstevel@tonic-gate 5237*0Sstevel@tonic-gate /* 5238*0Sstevel@tonic-gate * meta_mnsync_diskset_mddbs 5239*0Sstevel@tonic-gate * Calling node is guaranteed to be an owner node. 5240*0Sstevel@tonic-gate * Calling node is the master node. 5241*0Sstevel@tonic-gate * 5242*0Sstevel@tonic-gate * Master node verifies that ondisk mddb format matches its incore format. 5243*0Sstevel@tonic-gate * If no nodes are joined to set, remove the change log entries. 5244*0Sstevel@tonic-gate * If a node is joined to set, play the change log. 5245*0Sstevel@tonic-gate * 5246*0Sstevel@tonic-gate * Returns 0 - Success 5247*0Sstevel@tonic-gate * 1 - Master unable to join to set. 5248*0Sstevel@tonic-gate * 205 - Failure during RPC to another node 5249*0Sstevel@tonic-gate * -1 - Any other failure and ep is filled in. 5250*0Sstevel@tonic-gate * -1 return will eventually cause node to panic 5251*0Sstevel@tonic-gate * in a SunCluster environment. 
5252*0Sstevel@tonic-gate */ 5253*0Sstevel@tonic-gate int 5254*0Sstevel@tonic-gate meta_mnsync_diskset_mddbs( 5255*0Sstevel@tonic-gate mdsetname_t *sp, 5256*0Sstevel@tonic-gate md_error_t *ep 5257*0Sstevel@tonic-gate ) 5258*0Sstevel@tonic-gate { 5259*0Sstevel@tonic-gate md_set_desc *sd; 5260*0Sstevel@tonic-gate mddb_config_t c; 5261*0Sstevel@tonic-gate md_mn_msgclass_t class; 5262*0Sstevel@tonic-gate mddb_setflags_config_t sf; 5263*0Sstevel@tonic-gate md_mnnode_desc *nd, *nd2; 5264*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 5265*0Sstevel@tonic-gate int stale_set = 0; 5266*0Sstevel@tonic-gate 5267*0Sstevel@tonic-gate /* If setname is there, set desc should exist. */ 5268*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5269*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 5270*0Sstevel@tonic-gate "Unable to get set %s desc information"), sp->setname); 5271*0Sstevel@tonic-gate return (-1); 5272*0Sstevel@tonic-gate } 5273*0Sstevel@tonic-gate 5274*0Sstevel@tonic-gate /* Are there drives in the set? */ 5275*0Sstevel@tonic-gate if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5276*0Sstevel@tonic-gate ep) == NULL) { 5277*0Sstevel@tonic-gate if (! mdisok(ep)) { 5278*0Sstevel@tonic-gate return (-1); 5279*0Sstevel@tonic-gate } 5280*0Sstevel@tonic-gate /* No drives in set -- nothing to sync up */ 5281*0Sstevel@tonic-gate return (0); 5282*0Sstevel@tonic-gate } 5283*0Sstevel@tonic-gate 5284*0Sstevel@tonic-gate /* 5285*0Sstevel@tonic-gate * Is master node (which is this node) joined to set? 5286*0Sstevel@tonic-gate * If master node isn't joined (which means that no nodes 5287*0Sstevel@tonic-gate * are joined to diskset), remove the change log entries 5288*0Sstevel@tonic-gate * since no need to replay them - all nodes will have same 5289*0Sstevel@tonic-gate * view of mddbs since all nodes are reading in the mddbs 5290*0Sstevel@tonic-gate * from disk. 
5291*0Sstevel@tonic-gate * There is also no need to sync up the master and ondisk mddbs 5292*0Sstevel@tonic-gate * since master has no incore knowledge. 5293*0Sstevel@tonic-gate * Need to join master to set in order to flush the change 5294*0Sstevel@tonic-gate * log entries. Don't need to block I/O during join of master 5295*0Sstevel@tonic-gate * to set since no other nodes are joined to set and so no I/O 5296*0Sstevel@tonic-gate * can be occurring. 5297*0Sstevel@tonic-gate */ 5298*0Sstevel@tonic-gate if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 5299*0Sstevel@tonic-gate /* Join master to set */ 5300*0Sstevel@tonic-gate if (clnt_joinset(mynode(), sp, 5301*0Sstevel@tonic-gate MNSET_IN_RECONFIG, ep)) { 5302*0Sstevel@tonic-gate if (mdismddberror(ep, MDE_DB_STALE)) { 5303*0Sstevel@tonic-gate /* 5304*0Sstevel@tonic-gate * If STALE, print message and continue on. 5305*0Sstevel@tonic-gate * Don't do any writes or reads to mddbs 5306*0Sstevel@tonic-gate * so don't clear change log. 5307*0Sstevel@tonic-gate */ 5308*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 5309*0Sstevel@tonic-gate "Join of master node to STALE set %s"), 5310*0Sstevel@tonic-gate sp->setname); 5311*0Sstevel@tonic-gate stale_set = 1; 5312*0Sstevel@tonic-gate mdclrerror(ep); 5313*0Sstevel@tonic-gate } else if (mdismddberror(ep, MDE_DB_ACCOK)) { 5314*0Sstevel@tonic-gate /* ACCOK means mediator provided extra vote */ 5315*0Sstevel@tonic-gate mdclrerror(ep); 5316*0Sstevel@tonic-gate } else { 5317*0Sstevel@tonic-gate /* 5318*0Sstevel@tonic-gate * If master is unable to join set, print an 5319*0Sstevel@tonic-gate * error message. Don't return failure or node 5320*0Sstevel@tonic-gate * will panic during cluster reconfig cycle. 5321*0Sstevel@tonic-gate * Also, withdraw node from set in order to 5322*0Sstevel@tonic-gate * cleanup from failed join attempt. 
5323*0Sstevel@tonic-gate */ 5324*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 5325*0Sstevel@tonic-gate "Join of master node in set %s failed"), 5326*0Sstevel@tonic-gate sp->setname); 5327*0Sstevel@tonic-gate if (clnt_withdrawset(mynode(), sp, &xep)) 5328*0Sstevel@tonic-gate mdclrerror(&xep); 5329*0Sstevel@tonic-gate return (1); 5330*0Sstevel@tonic-gate } 5331*0Sstevel@tonic-gate } 5332*0Sstevel@tonic-gate /* 5333*0Sstevel@tonic-gate * Master node successfully joined. 5334*0Sstevel@tonic-gate * Set local copy of flags to OWN and 5335*0Sstevel@tonic-gate * send owner flag to rpc.metad. If not stale, 5336*0Sstevel@tonic-gate * flush the change log. 5337*0Sstevel@tonic-gate */ 5338*0Sstevel@tonic-gate sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 5339*0Sstevel@tonic-gate if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET, 5340*0Sstevel@tonic-gate MNSET_IN_RECONFIG, ep)) { 5341*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 5342*0Sstevel@tonic-gate "Flag update of master node join in set %s failed"), 5343*0Sstevel@tonic-gate sp->setname); 5344*0Sstevel@tonic-gate return (-1); 5345*0Sstevel@tonic-gate } 5346*0Sstevel@tonic-gate 5347*0Sstevel@tonic-gate if (!stale_set) { 5348*0Sstevel@tonic-gate if (mdmn_reset_changelog(sp, ep, 5349*0Sstevel@tonic-gate MDMN_CLF_RESETLOG) != 0) { 5350*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 5351*0Sstevel@tonic-gate "Unable to reset changelog.")); 5352*0Sstevel@tonic-gate return (-1); 5353*0Sstevel@tonic-gate } 5354*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5355*0Sstevel@tonic-gate "Removed changelog entries for set %s: %s"), 5356*0Sstevel@tonic-gate sp->setname, 5357*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 5358*0Sstevel@tonic-gate } 5359*0Sstevel@tonic-gate /* Reset new master flag before return */ 5360*0Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 5361*0Sstevel@tonic-gate sf.sf_setno = sp->setno; 5362*0Sstevel@tonic-gate 
sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5363*0Sstevel@tonic-gate sf.sf_flags = MDDB_NM_RESET; 5364*0Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */ 5365*0Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5366*0Sstevel@tonic-gate /* Ignore failure, failure to reset flag isn't catastrophic */ 5367*0Sstevel@tonic-gate (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5368*0Sstevel@tonic-gate &sf.sf_mde, NULL); 5369*0Sstevel@tonic-gate 5370*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5371*0Sstevel@tonic-gate "Reset new master flag for set %s: %s"), 5372*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5373*0Sstevel@tonic-gate 5374*0Sstevel@tonic-gate return (0); 5375*0Sstevel@tonic-gate } 5376*0Sstevel@tonic-gate 5377*0Sstevel@tonic-gate /* 5378*0Sstevel@tonic-gate * Is master already joined to STALE set (< 50% mddbs avail)? 5379*0Sstevel@tonic-gate * If so, can make no config changes to mddbs so don't check or play 5380*0Sstevel@tonic-gate * changelog and don't sync master node to ondisk mddbs. 5381*0Sstevel@tonic-gate * To get out of the stale state all nodes must be withdrawn 5382*0Sstevel@tonic-gate * from set. Then as nodes are re-joined, all nodes will 5383*0Sstevel@tonic-gate * have same view of mddbs since all nodes are reading the 5384*0Sstevel@tonic-gate * mddbs from disk. 
5385*0Sstevel@tonic-gate */ 5386*0Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c)); 5387*0Sstevel@tonic-gate c.c_id = 0; 5388*0Sstevel@tonic-gate c.c_setno = sp->setno; 5389*0Sstevel@tonic-gate if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 5390*0Sstevel@tonic-gate (void) mdstealerror(ep, &c.c_mde); 5391*0Sstevel@tonic-gate return (-1); 5392*0Sstevel@tonic-gate } 5393*0Sstevel@tonic-gate if (c.c_flags & MDDB_C_STALE) { 5394*0Sstevel@tonic-gate return (0); 5395*0Sstevel@tonic-gate } 5396*0Sstevel@tonic-gate 5397*0Sstevel@tonic-gate /* 5398*0Sstevel@tonic-gate * If this node is NOT a newly chosen master, then there's 5399*0Sstevel@tonic-gate * nothing else to do since the change log should be empty and 5400*0Sstevel@tonic-gate * the ondisk and incore mddbs are already consistent. 5401*0Sstevel@tonic-gate * 5402*0Sstevel@tonic-gate * A newly chosen master is a node that was not the master 5403*0Sstevel@tonic-gate * at the beginning of the reconfig cycle. If a node is a new 5404*0Sstevel@tonic-gate * master, then the new master state is reset after the ondisk 5405*0Sstevel@tonic-gate * and incore mddbs are consistent and the change log has 5406*0Sstevel@tonic-gate * been replayed. 5407*0Sstevel@tonic-gate */ 5408*0Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 5409*0Sstevel@tonic-gate sf.sf_setno = sp->setno; 5410*0Sstevel@tonic-gate sf.sf_flags = MDDB_NM_GET; 5411*0Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */ 5412*0Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5413*0Sstevel@tonic-gate if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) && 5414*0Sstevel@tonic-gate ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) { 5415*0Sstevel@tonic-gate return (0); 5416*0Sstevel@tonic-gate } 5417*0Sstevel@tonic-gate 5418*0Sstevel@tonic-gate /* 5419*0Sstevel@tonic-gate * Now, sync up incore master view to ondisk mddbs. 
5420*0Sstevel@tonic-gate * This is needed in the case where a master node 5421*0Sstevel@tonic-gate * had made a change to the mddb, but this change 5422*0Sstevel@tonic-gate * may not have been relayed to the slaves yet. 5423*0Sstevel@tonic-gate * So, the new master needs to verify that the ondisk 5424*0Sstevel@tonic-gate * mddbs match what the new master has incore - 5425*0Sstevel@tonic-gate * if different, new master rewrites all of the mddbs. 5426*0Sstevel@tonic-gate * Then the new master will replay the changelog and the 5427*0Sstevel@tonic-gate * new master will then execute what the old master had 5428*0Sstevel@tonic-gate * done. 5429*0Sstevel@tonic-gate * 5430*0Sstevel@tonic-gate * Block all I/Os to disks in this diskset on all nodes in 5431*0Sstevel@tonic-gate * the diskset. This will allow the rewriting of the mddbs 5432*0Sstevel@tonic-gate * (if needed), to proceed in a timely manner. 5433*0Sstevel@tonic-gate * 5434*0Sstevel@tonic-gate * If block of I/Os fail, return a -1. 5435*0Sstevel@tonic-gate */ 5436*0Sstevel@tonic-gate 5437*0Sstevel@tonic-gate nd = sd->sd_nodelist; 5438*0Sstevel@tonic-gate while (nd) { 5439*0Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */ 5440*0Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5441*0Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5442*0Sstevel@tonic-gate nd = nd->nd_next; 5443*0Sstevel@tonic-gate continue; 5444*0Sstevel@tonic-gate } 5445*0Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5446*0Sstevel@tonic-gate MN_SUSP_IO, ep)) { 5447*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 5448*0Sstevel@tonic-gate "Unable to suspend I/O on node %s in set %s"), 5449*0Sstevel@tonic-gate nd->nd_nodename, sp->setname); 5450*0Sstevel@tonic-gate 5451*0Sstevel@tonic-gate /* 5452*0Sstevel@tonic-gate * Resume all other nodes that had been suspended. 5453*0Sstevel@tonic-gate * (Reconfig return step also resumes I/Os 5454*0Sstevel@tonic-gate * for all sets.) 
5455*0Sstevel@tonic-gate */ 5456*0Sstevel@tonic-gate nd2 = sd->sd_nodelist; 5457*0Sstevel@tonic-gate while (nd2) { 5458*0Sstevel@tonic-gate /* Stop when reaching failed node */ 5459*0Sstevel@tonic-gate if (nd2->nd_nodeid == nd->nd_nodeid) 5460*0Sstevel@tonic-gate break; 5461*0Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */ 5462*0Sstevel@tonic-gate if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5463*0Sstevel@tonic-gate (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5464*0Sstevel@tonic-gate nd2 = nd2->nd_next; 5465*0Sstevel@tonic-gate continue; 5466*0Sstevel@tonic-gate } 5467*0Sstevel@tonic-gate (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5468*0Sstevel@tonic-gate sp->setno, MN_RES_IO, &xep)); 5469*0Sstevel@tonic-gate nd2 = nd2->nd_next; 5470*0Sstevel@tonic-gate } 5471*0Sstevel@tonic-gate 5472*0Sstevel@tonic-gate /* 5473*0Sstevel@tonic-gate * If an RPC failure on another node, return a 205. 5474*0Sstevel@tonic-gate * Otherwise, exit with failure. 5475*0Sstevel@tonic-gate */ 5476*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 5477*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 5478*0Sstevel@tonic-gate nd->nd_nodeid)) { 5479*0Sstevel@tonic-gate return (205); 5480*0Sstevel@tonic-gate } else { 5481*0Sstevel@tonic-gate return (-1); 5482*0Sstevel@tonic-gate } 5483*0Sstevel@tonic-gate 5484*0Sstevel@tonic-gate } 5485*0Sstevel@tonic-gate nd = nd->nd_next; 5486*0Sstevel@tonic-gate } 5487*0Sstevel@tonic-gate 5488*0Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c)); 5489*0Sstevel@tonic-gate c.c_id = 0; 5490*0Sstevel@tonic-gate c.c_setno = sp->setno; 5491*0Sstevel@tonic-gate /* Master can't sync up to ondisk mddbs? Kick it out of cluster */ 5492*0Sstevel@tonic-gate if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0) 5493*0Sstevel@tonic-gate return (-1); 5494*0Sstevel@tonic-gate 5495*0Sstevel@tonic-gate /* 5496*0Sstevel@tonic-gate * Resume I/Os that were suspended above. 
5497*0Sstevel@tonic-gate */ 5498*0Sstevel@tonic-gate nd = sd->sd_nodelist; 5499*0Sstevel@tonic-gate while (nd) { 5500*0Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */ 5501*0Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5502*0Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5503*0Sstevel@tonic-gate nd = nd->nd_next; 5504*0Sstevel@tonic-gate continue; 5505*0Sstevel@tonic-gate } 5506*0Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5507*0Sstevel@tonic-gate MN_RES_IO, ep)) { 5508*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 5509*0Sstevel@tonic-gate "Unable to resume I/O on node %s in set %s"), 5510*0Sstevel@tonic-gate nd->nd_nodename, sp->setname); 5511*0Sstevel@tonic-gate 5512*0Sstevel@tonic-gate /* 5513*0Sstevel@tonic-gate * If an RPC failure then don't do any 5514*0Sstevel@tonic-gate * more RPC calls, since one timeout is enough 5515*0Sstevel@tonic-gate * to endure. If RPC failure to another node, return 5516*0Sstevel@tonic-gate * 205. If RPC failure to my node, return -1. 5517*0Sstevel@tonic-gate * If not an RPC failure, continue resuming the 5518*0Sstevel@tonic-gate * rest of the nodes and then return -1. 5519*0Sstevel@tonic-gate */ 5520*0Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 5521*0Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid == 5522*0Sstevel@tonic-gate nd->nd_nodeid) { 5523*0Sstevel@tonic-gate return (-1); 5524*0Sstevel@tonic-gate } else { 5525*0Sstevel@tonic-gate return (205); 5526*0Sstevel@tonic-gate } 5527*0Sstevel@tonic-gate } 5528*0Sstevel@tonic-gate 5529*0Sstevel@tonic-gate /* 5530*0Sstevel@tonic-gate * If not an RPC error, continue resuming rest of 5531*0Sstevel@tonic-gate * nodes, ignoring any failures except for an 5532*0Sstevel@tonic-gate * RPC failure which constitutes an immediate exit. 5533*0Sstevel@tonic-gate * Start in middle of list with failing node. 
5534*0Sstevel@tonic-gate */ 5535*0Sstevel@tonic-gate nd2 = nd->nd_next; 5536*0Sstevel@tonic-gate while (nd2) { 5537*0Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */ 5538*0Sstevel@tonic-gate if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5539*0Sstevel@tonic-gate (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5540*0Sstevel@tonic-gate nd2 = nd2->nd_next; 5541*0Sstevel@tonic-gate continue; 5542*0Sstevel@tonic-gate } 5543*0Sstevel@tonic-gate (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5544*0Sstevel@tonic-gate sp->setno, MN_RES_IO, &xep)); 5545*0Sstevel@tonic-gate if (mdanyrpcerror(&xep)) { 5546*0Sstevel@tonic-gate return (-1); 5547*0Sstevel@tonic-gate } 5548*0Sstevel@tonic-gate nd2 = nd2->nd_next; 5549*0Sstevel@tonic-gate } 5550*0Sstevel@tonic-gate } 5551*0Sstevel@tonic-gate nd = nd->nd_next; 5552*0Sstevel@tonic-gate } 5553*0Sstevel@tonic-gate 5554*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed " 5555*0Sstevel@tonic-gate "checking/writing the mddb for set %s: %s"), sp->setname, 5556*0Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 5557*0Sstevel@tonic-gate 5558*0Sstevel@tonic-gate /* 5559*0Sstevel@tonic-gate * Send (aka replay) all messages we find in the changelog. 5560*0Sstevel@tonic-gate * Flag the messages with 5561*0Sstevel@tonic-gate * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them 5562*0Sstevel@tonic-gate * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd. 
5563*0Sstevel@tonic-gate */ 5564*0Sstevel@tonic-gate for (class = MD_MN_NCLASSES - 1; class > 0; class--) { 5565*0Sstevel@tonic-gate mdmn_changelog_record_t *lr; 5566*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 5567*0Sstevel@tonic-gate md_mn_result_t *resultp = NULL; 5568*0Sstevel@tonic-gate int ret; 5569*0Sstevel@tonic-gate 5570*0Sstevel@tonic-gate lr = mdmn_get_changelogrec(sp->setno, class); 5571*0Sstevel@tonic-gate if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) { 5572*0Sstevel@tonic-gate /* no entry for this class */ 5573*0Sstevel@tonic-gate continue; 5574*0Sstevel@tonic-gate } 5575*0Sstevel@tonic-gate 5576*0Sstevel@tonic-gate meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5577*0Sstevel@tonic-gate "replaying message ID=(%d, 0x%llx-%d)\n"), 5578*0Sstevel@tonic-gate MSGID_ELEMS(lr->lr_msg.msg_msgid)); 5579*0Sstevel@tonic-gate 5580*0Sstevel@tonic-gate ret = mdmn_send_message_with_msgid( 5581*0Sstevel@tonic-gate lr->lr_msg.msg_setno, 5582*0Sstevel@tonic-gate lr->lr_msg.msg_type, 5583*0Sstevel@tonic-gate lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG | 5584*0Sstevel@tonic-gate MD_MSGF_OVERRIDE_SUSPEND, 5585*0Sstevel@tonic-gate lr->lr_msg.msg_event_data, 5586*0Sstevel@tonic-gate lr->lr_msg.msg_event_size, 5587*0Sstevel@tonic-gate &resultp, 5588*0Sstevel@tonic-gate &lr->lr_msg.msg_msgid, 5589*0Sstevel@tonic-gate &xep); 5590*0Sstevel@tonic-gate 5591*0Sstevel@tonic-gate meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5592*0Sstevel@tonic-gate "mdmn_send_message returned %d\n"), ret); 5593*0Sstevel@tonic-gate 5594*0Sstevel@tonic-gate if (resultp) 5595*0Sstevel@tonic-gate free_result(resultp); 5596*0Sstevel@tonic-gate } 5597*0Sstevel@tonic-gate 5598*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5599*0Sstevel@tonic-gate "Playing changelog completed for set %s: %s"), 5600*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5601*0Sstevel@tonic-gate 5602*0Sstevel@tonic-gate /* 5603*0Sstevel@tonic-gate * Now that new master has ondisk and incore 
mddbs in sync, reset 5604*0Sstevel@tonic-gate * this node's new master kernel flag (for this set). If this node 5605*0Sstevel@tonic-gate * re-enters another reconfig cycle before the completion of this 5606*0Sstevel@tonic-gate * reconfig cycle, this master node won't need to check if the ondisk 5607*0Sstevel@tonic-gate * and incore mddbs are in sync since this node won't be considered 5608*0Sstevel@tonic-gate * a new master (since this flag is being reset here in the middle of 5609*0Sstevel@tonic-gate * step2). This will save time during any subsequent reconfig 5610*0Sstevel@tonic-gate * cycles as long as this node continues to be master. 5611*0Sstevel@tonic-gate */ 5612*0Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 5613*0Sstevel@tonic-gate sf.sf_setno = sp->setno; 5614*0Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5615*0Sstevel@tonic-gate sf.sf_flags = MDDB_NM_RESET; 5616*0Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */ 5617*0Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5618*0Sstevel@tonic-gate /* Ignore failure, since failure to reset flag isn't catastrophic */ 5619*0Sstevel@tonic-gate (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL); 5620*0Sstevel@tonic-gate 5621*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5622*0Sstevel@tonic-gate "Reset new master flag for set %s: %s"), 5623*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5624*0Sstevel@tonic-gate 5625*0Sstevel@tonic-gate return (0); 5626*0Sstevel@tonic-gate } 5627*0Sstevel@tonic-gate 5628*0Sstevel@tonic-gate /* 5629*0Sstevel@tonic-gate * meta_mnjoin_all will join all starting nodes in the diskset. 5630*0Sstevel@tonic-gate * A starting node is considered to be any node that is not 5631*0Sstevel@tonic-gate * an owner of the set but is a member of the cluster. 5632*0Sstevel@tonic-gate * Master node is already joined to set (done in meta_mnsync_diskset_mddbs). 
5633*0Sstevel@tonic-gate * 5634*0Sstevel@tonic-gate * Caller is the Master node. 5635*0Sstevel@tonic-gate * 5636*0Sstevel@tonic-gate * Returns 0 - Success 5637*0Sstevel@tonic-gate * 205 - Failure during RPC to another node 5638*0Sstevel@tonic-gate * -1 - Any other failure and ep is filled in. 5639*0Sstevel@tonic-gate */ 5640*0Sstevel@tonic-gate int 5641*0Sstevel@tonic-gate meta_mnjoin_all( 5642*0Sstevel@tonic-gate mdsetname_t *sp, 5643*0Sstevel@tonic-gate md_error_t *ep 5644*0Sstevel@tonic-gate ) 5645*0Sstevel@tonic-gate { 5646*0Sstevel@tonic-gate md_set_desc *sd; 5647*0Sstevel@tonic-gate md_mnnode_desc *nd, *nd2; 5648*0Sstevel@tonic-gate int rval = 0; 5649*0Sstevel@tonic-gate int stale_flag = 0; 5650*0Sstevel@tonic-gate mddb_config_t c; 5651*0Sstevel@tonic-gate int susp_res_flag = 0; 5652*0Sstevel@tonic-gate md_error_t xep = mdnullerror; 5653*0Sstevel@tonic-gate 5654*0Sstevel@tonic-gate /* If setname is there, set desc should exist. */ 5655*0Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5656*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 5657*0Sstevel@tonic-gate "Unable to get set %s desc information"), sp->setname); 5658*0Sstevel@tonic-gate return (-1); 5659*0Sstevel@tonic-gate } 5660*0Sstevel@tonic-gate 5661*0Sstevel@tonic-gate /* Are there drives in the set? */ 5662*0Sstevel@tonic-gate if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5663*0Sstevel@tonic-gate ep) == NULL) { 5664*0Sstevel@tonic-gate if (! mdisok(ep)) { 5665*0Sstevel@tonic-gate return (-1); 5666*0Sstevel@tonic-gate } 5667*0Sstevel@tonic-gate /* No drives in set -- nothing to join */ 5668*0Sstevel@tonic-gate return (0); 5669*0Sstevel@tonic-gate } 5670*0Sstevel@tonic-gate 5671*0Sstevel@tonic-gate /* 5672*0Sstevel@tonic-gate * Is set currently stale? 
5673*0Sstevel@tonic-gate */ 5674*0Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c)); 5675*0Sstevel@tonic-gate c.c_id = 0; 5676*0Sstevel@tonic-gate c.c_setno = sp->setno; 5677*0Sstevel@tonic-gate /* Ignore failure since master node may not be joined yet */ 5678*0Sstevel@tonic-gate (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL); 5679*0Sstevel@tonic-gate if (c.c_flags & MDDB_C_STALE) { 5680*0Sstevel@tonic-gate stale_flag = MNSET_IS_STALE; 5681*0Sstevel@tonic-gate } 5682*0Sstevel@tonic-gate 5683*0Sstevel@tonic-gate /* 5684*0Sstevel@tonic-gate * If any nodes are going to be joined to diskset, then 5685*0Sstevel@tonic-gate * suspend I/O to all disks in diskset so that nodes can join 5686*0Sstevel@tonic-gate * (read in mddbs) in a reasonable amount of time even under 5687*0Sstevel@tonic-gate * high I/O load. Don't need to do this if set is STALE since 5688*0Sstevel@tonic-gate * no I/O can be occurring to a STALE set. 5689*0Sstevel@tonic-gate */ 5690*0Sstevel@tonic-gate if (stale_flag != MNSET_IS_STALE) { 5691*0Sstevel@tonic-gate nd = sd->sd_nodelist; 5692*0Sstevel@tonic-gate while (nd) { 5693*0Sstevel@tonic-gate /* Found a node that will be joined to diskset */ 5694*0Sstevel@tonic-gate if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5695*0Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5696*0Sstevel@tonic-gate /* Set flag that diskset should be suspended */ 5697*0Sstevel@tonic-gate susp_res_flag = 1; 5698*0Sstevel@tonic-gate break; 5699*0Sstevel@tonic-gate } 5700*0Sstevel@tonic-gate nd = nd->nd_next; 5701*0Sstevel@tonic-gate } 5702*0Sstevel@tonic-gate } 5703*0Sstevel@tonic-gate 5704*0Sstevel@tonic-gate if (susp_res_flag) { 5705*0Sstevel@tonic-gate /* 5706*0Sstevel@tonic-gate * Block all I/Os to disks in this diskset on all joined 5707*0Sstevel@tonic-gate * nodes in the diskset. 5708*0Sstevel@tonic-gate * If block of I/Os fails due to an RPC failure on another 5709*0Sstevel@tonic-gate * node, return 205; otherwise, return -1. 
5710*0Sstevel@tonic-gate */ 5711*0Sstevel@tonic-gate nd = sd->sd_nodelist; 5712*0Sstevel@tonic-gate while (nd) { 5713*0Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */ 5714*0Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5715*0Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5716*0Sstevel@tonic-gate nd = nd->nd_next; 5717*0Sstevel@tonic-gate continue; 5718*0Sstevel@tonic-gate } 5719*0Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5720*0Sstevel@tonic-gate MN_SUSP_IO, ep)) { 5721*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 5722*0Sstevel@tonic-gate "Unable to suspend I/O on node %s" 5723*0Sstevel@tonic-gate " in set %s"), nd->nd_nodename, 5724*0Sstevel@tonic-gate sp->setname); 5725*0Sstevel@tonic-gate /* 5726*0Sstevel@tonic-gate * Resume other nodes that had been suspended. 5727*0Sstevel@tonic-gate * (Reconfig return step also resumes I/Os 5728*0Sstevel@tonic-gate * for all sets.) 5729*0Sstevel@tonic-gate */ 5730*0Sstevel@tonic-gate nd2 = sd->sd_nodelist; 5731*0Sstevel@tonic-gate while (nd2) { 5732*0Sstevel@tonic-gate /* Stop when reaching failed node */ 5733*0Sstevel@tonic-gate if (nd2->nd_nodeid == nd->nd_nodeid) 5734*0Sstevel@tonic-gate break; 5735*0Sstevel@tonic-gate /* Skip non-alive/non-owner nodes */ 5736*0Sstevel@tonic-gate if ((!(nd2->nd_flags & 5737*0Sstevel@tonic-gate MD_MN_NODE_ALIVE)) || 5738*0Sstevel@tonic-gate (!(nd2->nd_flags & 5739*0Sstevel@tonic-gate MD_MN_NODE_OWN))) { 5740*0Sstevel@tonic-gate nd2 = nd2->nd_next; 5741*0Sstevel@tonic-gate continue; 5742*0Sstevel@tonic-gate } 5743*0Sstevel@tonic-gate (void) (clnt_mn_susp_res_io( 5744*0Sstevel@tonic-gate nd2->nd_nodename, sp->setno, 5745*0Sstevel@tonic-gate MN_RES_IO, &xep)); 5746*0Sstevel@tonic-gate nd2 = nd2->nd_next; 5747*0Sstevel@tonic-gate } 5748*0Sstevel@tonic-gate 5749*0Sstevel@tonic-gate /* 5750*0Sstevel@tonic-gate * If the suspend failed due to an 5751*0Sstevel@tonic-gate * RPC failure on another node, return 
5752*0Sstevel@tonic-gate * a 205. 5753*0Sstevel@tonic-gate * Otherwise, exit with failure. 5754*0Sstevel@tonic-gate * The return reconfig step will resume 5755*0Sstevel@tonic-gate * I/Os for all disksets. 5756*0Sstevel@tonic-gate */ 5757*0Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 5758*0Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 5759*0Sstevel@tonic-gate nd->nd_nodeid)) { 5760*0Sstevel@tonic-gate return (205); 5761*0Sstevel@tonic-gate } else { 5762*0Sstevel@tonic-gate return (-1); 5763*0Sstevel@tonic-gate } 5764*0Sstevel@tonic-gate } 5765*0Sstevel@tonic-gate nd = nd->nd_next; 5766*0Sstevel@tonic-gate } 5767*0Sstevel@tonic-gate } 5768*0Sstevel@tonic-gate 5769*0Sstevel@tonic-gate nd = sd->sd_nodelist; 5770*0Sstevel@tonic-gate while (nd) { 5771*0Sstevel@tonic-gate /* 5772*0Sstevel@tonic-gate * If a node is in the membership list but isn't joined 5773*0Sstevel@tonic-gate * to the set, try to join the node. 5774*0Sstevel@tonic-gate */ 5775*0Sstevel@tonic-gate if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5776*0Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5777*0Sstevel@tonic-gate if (clnt_joinset(nd->nd_nodename, sp, 5778*0Sstevel@tonic-gate (MNSET_IN_RECONFIG | stale_flag), ep)) { 5779*0Sstevel@tonic-gate /* 5780*0Sstevel@tonic-gate * If RPC failure to another node 5781*0Sstevel@tonic-gate * then exit without attempting anything else. 5782*0Sstevel@tonic-gate * (Reconfig return step will resume I/Os 5783*0Sstevel@tonic-gate * for all sets.) 5784*0Sstevel@tonic-gate */ 5785*0Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 5786*0Sstevel@tonic-gate mde_perror(ep, ""); 5787*0Sstevel@tonic-gate return (205); 5788*0Sstevel@tonic-gate } 5789*0Sstevel@tonic-gate /* 5790*0Sstevel@tonic-gate * STALE and ACCOK failures aren't true 5791*0Sstevel@tonic-gate * failures. STALE means that <50% mddbs 5792*0Sstevel@tonic-gate * are available. ACCOK means that the 5793*0Sstevel@tonic-gate * mediator provided the extra vote. 
5794*0Sstevel@tonic-gate * If a true failure, then print messasge 5795*0Sstevel@tonic-gate * and withdraw node from set in order to 5796*0Sstevel@tonic-gate * cleanup from failed join attempt. 5797*0Sstevel@tonic-gate */ 5798*0Sstevel@tonic-gate if ((!mdismddberror(ep, MDE_DB_STALE)) && 5799*0Sstevel@tonic-gate (!mdismddberror(ep, MDE_DB_ACCOK))) { 5800*0Sstevel@tonic-gate mde_perror(ep, 5801*0Sstevel@tonic-gate "WARNING: Unable to join node %s " 5802*0Sstevel@tonic-gate "to set %s", nd->nd_nodename, 5803*0Sstevel@tonic-gate sp->setname); 5804*0Sstevel@tonic-gate mdclrerror(ep); 5805*0Sstevel@tonic-gate if (clnt_withdrawset(nd->nd_nodename, 5806*0Sstevel@tonic-gate sp, &xep)) 5807*0Sstevel@tonic-gate mdclrerror(&xep); 5808*0Sstevel@tonic-gate nd = nd->nd_next; 5809*0Sstevel@tonic-gate continue; 5810*0Sstevel@tonic-gate } 5811*0Sstevel@tonic-gate } 5812*0Sstevel@tonic-gate /* Set owner flag even if STALE or ACCOK */ 5813*0Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN; 5814*0Sstevel@tonic-gate } 5815*0Sstevel@tonic-gate nd = nd->nd_next; 5816*0Sstevel@tonic-gate } 5817*0Sstevel@tonic-gate /* 5818*0Sstevel@tonic-gate * Resume I/Os if suspended above. 5819*0Sstevel@tonic-gate */ 5820*0Sstevel@tonic-gate if (susp_res_flag) { 5821*0Sstevel@tonic-gate nd = sd->sd_nodelist; 5822*0Sstevel@tonic-gate while (nd) { 5823*0Sstevel@tonic-gate /* 5824*0Sstevel@tonic-gate * Skip non-alive and non-owner nodes 5825*0Sstevel@tonic-gate * (this list doesn't include any of 5826*0Sstevel@tonic-gate * the nodes that were joined). 
5827*0Sstevel@tonic-gate */ 5828*0Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5829*0Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5830*0Sstevel@tonic-gate nd = nd->nd_next; 5831*0Sstevel@tonic-gate continue; 5832*0Sstevel@tonic-gate } 5833*0Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5834*0Sstevel@tonic-gate MN_RES_IO, ep)) { 5835*0Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 5836*0Sstevel@tonic-gate "Unable to resume I/O on node %s" 5837*0Sstevel@tonic-gate " in set %s"), nd->nd_nodename, 5838*0Sstevel@tonic-gate sp->setname); 5839*0Sstevel@tonic-gate 5840*0Sstevel@tonic-gate /* 5841*0Sstevel@tonic-gate * If an RPC failure then don't do any 5842*0Sstevel@tonic-gate * more RPC calls, since one timeout is enough 5843*0Sstevel@tonic-gate * to endure. If RPC failure to another node, 5844*0Sstevel@tonic-gate * return 205. If RPC failure to my node, 5845*0Sstevel@tonic-gate * return -1. 5846*0Sstevel@tonic-gate * (Reconfig return step will resume I/Os 5847*0Sstevel@tonic-gate * for all sets.) 5848*0Sstevel@tonic-gate * If not an RPC failure, continue resuming the 5849*0Sstevel@tonic-gate * rest of the nodes and then return -1. 5850*0Sstevel@tonic-gate */ 5851*0Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 5852*0Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid == 5853*0Sstevel@tonic-gate nd->nd_nodeid) { 5854*0Sstevel@tonic-gate return (-1); 5855*0Sstevel@tonic-gate } else { 5856*0Sstevel@tonic-gate return (205); 5857*0Sstevel@tonic-gate } 5858*0Sstevel@tonic-gate } 5859*0Sstevel@tonic-gate 5860*0Sstevel@tonic-gate /* 5861*0Sstevel@tonic-gate * If not an RPC error, continue resuming rest 5862*0Sstevel@tonic-gate * of nodes, ignoring any failures except for 5863*0Sstevel@tonic-gate * an RPC failure which constitutes an 5864*0Sstevel@tonic-gate * immediate exit. 5865*0Sstevel@tonic-gate * Start in middle of list with failing node. 
5866*0Sstevel@tonic-gate */ 5867*0Sstevel@tonic-gate nd2 = nd->nd_next; 5868*0Sstevel@tonic-gate while (nd2) { 5869*0Sstevel@tonic-gate /* Skip non-owner nodes */ 5870*0Sstevel@tonic-gate if ((!(nd2->nd_flags & 5871*0Sstevel@tonic-gate MD_MN_NODE_ALIVE)) || 5872*0Sstevel@tonic-gate (!(nd2->nd_flags & 5873*0Sstevel@tonic-gate MD_MN_NODE_OWN))) { 5874*0Sstevel@tonic-gate nd2 = nd2->nd_next; 5875*0Sstevel@tonic-gate continue; 5876*0Sstevel@tonic-gate } 5877*0Sstevel@tonic-gate (void) (clnt_mn_susp_res_io( 5878*0Sstevel@tonic-gate nd2->nd_nodename, sp->setno, 5879*0Sstevel@tonic-gate MN_RES_IO, &xep)); 5880*0Sstevel@tonic-gate if (mdanyrpcerror(&xep)) { 5881*0Sstevel@tonic-gate return (-1); 5882*0Sstevel@tonic-gate } 5883*0Sstevel@tonic-gate nd2 = nd2->nd_next; 5884*0Sstevel@tonic-gate } 5885*0Sstevel@tonic-gate } 5886*0Sstevel@tonic-gate nd = nd->nd_next; 5887*0Sstevel@tonic-gate } 5888*0Sstevel@tonic-gate } 5889*0Sstevel@tonic-gate 5890*0Sstevel@tonic-gate nd = sd->sd_nodelist; 5891*0Sstevel@tonic-gate while (nd) { 5892*0Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 5893*0Sstevel@tonic-gate nd = nd->nd_next; 5894*0Sstevel@tonic-gate continue; 5895*0Sstevel@tonic-gate } 5896*0Sstevel@tonic-gate /* 5897*0Sstevel@tonic-gate * If 1 node fails - go ahead and update the rest except 5898*0Sstevel@tonic-gate * in the case of an RPC failure, fail immediately. 
5899*0Sstevel@tonic-gate */ 5900*0Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, 5901*0Sstevel@tonic-gate sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 5902*0Sstevel@tonic-gate /* RPC failure to another node */ 5903*0Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 5904*0Sstevel@tonic-gate return (205); 5905*0Sstevel@tonic-gate } 5906*0Sstevel@tonic-gate nd = nd->nd_next; 5907*0Sstevel@tonic-gate rval = -1; 5908*0Sstevel@tonic-gate continue; 5909*0Sstevel@tonic-gate } 5910*0Sstevel@tonic-gate nd = nd->nd_next; 5911*0Sstevel@tonic-gate } 5912*0Sstevel@tonic-gate 5913*0Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5914*0Sstevel@tonic-gate "Join of all nodes completed for set %s: %s"), 5915*0Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5916*0Sstevel@tonic-gate 5917*0Sstevel@tonic-gate return (rval); 5918*0Sstevel@tonic-gate } 5919